001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Scooter Willis
023 *
024 */
025 package org.biojava3.core.sequence.loader;
026
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.logging.Logger;

import org.biojava3.core.exceptions.CompoundNotFoundError;
import org.biojava3.core.sequence.AccessionID;
import org.biojava3.core.sequence.ProteinSequence;
import org.biojava3.core.sequence.Strand;
import org.biojava3.core.sequence.compound.AminoAcidCompound;
import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava3.core.sequence.features.DBReferenceInfo;
import org.biojava3.core.sequence.features.DatabaseReferenceInterface;
import org.biojava3.core.sequence.features.FeaturesKeyWordInterface;
import org.biojava3.core.sequence.storage.SequenceAsStringHelper;
import org.biojava3.core.sequence.template.Compound;
import org.biojava3.core.sequence.template.CompoundSet;
import org.biojava3.core.sequence.template.ProxySequenceReader;
import org.biojava3.core.sequence.template.SequenceMixin;
import org.biojava3.core.sequence.template.SequenceProxyView;
import org.biojava3.core.sequence.template.SequenceView;
import org.biojava3.core.util.XMLHelper;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
061
062 /**
063 *
064 * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements
065 * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3
066 * ProteinSequence.
067 * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does
068 * not manage cache.
069 * @param <C>
070 */
071
072 public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
073
074 private static final Logger logger = Logger.getLogger(UniprotProxySequenceReader.class.getName());
075 private static String uniprotbaseURL = "http://www.uniprot.org"; //"http://pir.uniprot.org";
076 private static String uniprotDirectoryCache = null;
077
078
079 private String sequence;
080 private CompoundSet<C> compoundSet;
081 private List<C> parsedCompounds = new ArrayList<C>();
082 Document uniprotDoc;
083
084
085 /**
086 * The uniprot id is used to retrieve the uniprot XML which is then parsed as a DOM object
087 * so we know everything about the protein. If an error occurs throw an exception. We could
088 * have a bad uniprot id or network error
089 * @param accession
090 * @param compoundSet
091 * @throws Exception
092 */
093 public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws Exception {
094
095 setCompoundSet(compoundSet);
096 uniprotDoc = this.getUniprotXML(accession);
097 String seq = this.getSequence(uniprotDoc);
098 setContents(seq);
099 }
100
101 public void setCompoundSet(CompoundSet<C> compoundSet) {
102 this.compoundSet = compoundSet;
103 }
104
105 /**
106 * Once the sequence is retrieved set the contents and make sure everything this is valid
107 * @param sequence
108 */
109 public void setContents(String sequence) {
110 // Horrendously inefficient - pretty much the way the old BJ did things.
111 // TODO Should be optimised.
112 this.sequence = sequence;
113 this.parsedCompounds.clear();
114 for (int i = 0; i < sequence.length();) {
115 String compoundStr = null;
116 C compound = null;
117 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
118 compoundStr = sequence.substring(i, i + compoundStrLength);
119 compound = compoundSet.getCompoundForString(compoundStr);
120 }
121 if (compound == null) {
122 throw new CompoundNotFoundError(compoundStr);
123 } else {
124 i += compoundStr.length();
125 }
126 this.parsedCompounds.add(compound);
127 }
128 }
129
130 /**
131 * The sequence length
132 * @return
133 */
134 public int getLength() {
135 return this.parsedCompounds.size();
136 }
137
138 /**
139 *
140 * @param position
141 * @return
142 */
143 public C getCompoundAt(int position) {
144 return this.parsedCompounds.get(position - 1);
145 }
146 /**
147 *
148 * @param compound
149 * @return
150 */
151 public int getIndexOf(C compound) {
152 return this.parsedCompounds.indexOf(compound) + 1;
153 }
154 /**
155 *
156 * @param compound
157 * @return
158 */
159 public int getLastIndexOf(C compound) {
160 return this.parsedCompounds.lastIndexOf(compound) + 1;
161 }
162 /**
163 *
164 * @return
165 */
166
167 @Override
168 public String toString() {
169 return getSequenceAsString();
170 }
171 /**
172 *
173 * @return
174 */
175 public String getSequenceAsString() {
176 return sequence;
177 }
178 /**
179 *
180 * @return
181 */
182 public List<C> getAsList() {
183 return this.parsedCompounds;
184 }
185 /**
186 *
187 * @return
188 */
189 @Override
190 public SequenceView<C> getInverse() {
191 return SequenceMixin.inverse(this);
192 }
193 /**
194 *
195 * @param bioBegin
196 * @param bioEnd
197 * @param strand
198 * @return
199 */
200
201 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
202 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
203 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
204 }
205 /**
206 *
207 * @param bioBegin
208 * @param bioEnd
209 * @return
210 */
211 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
212 return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
213 }
214 /**
215 *
216 * @return
217 */
218 public Iterator<C> iterator() {
219 return this.parsedCompounds.iterator();
220 }
221 /**
222 *
223 * @return
224 */
225 public CompoundSet<C> getCompoundSet() {
226 return compoundSet;
227 }
228 /**
229 *
230 * @return
231 */
232
233 public AccessionID getAccession() {
234 throw new UnsupportedOperationException("Not supported yet.");
235 }
236 /**
237 *
238 * @param compounds
239 * @return
240 */
241
242 public int countCompounds(C... compounds) {
243 throw new UnsupportedOperationException("Not supported yet.");
244 }
245 /**
246 *
247 * @param accession
248 * @return
249 * @throws Exception
250 */
251 private Document getUniprotXML(String accession) throws Exception {
252 int index = accession.lastIndexOf(".");
253 String key = accession;
254 if (index != -1) {
255 key = accession.substring(0, index);
256 }
257 StringBuilder sb = new StringBuilder();
258 File f = null;
259 if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
260 f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
261 if (f.exists()) {
262 FileReader fr = new FileReader(f);
263 int size = (int) f.length();
264 char[] data = new char[size];
265 fr.read(data);
266 fr.close();
267 sb.append(data);
268 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
269 if (index != -1) {
270 int lastIndex = sb.indexOf(">", index);
271 sb.replace(index, lastIndex, "");
272 }
273 }
274
275 }
276 if (sb.length() == 0) {
277 String uniprotURL = getUniprotbaseURL() + "/uniprot/" + key + ".xml";
278 logger.info("Loading " + uniprotURL);
279 URL uniprot = new URL(uniprotURL);
280 URLConnection uniprotConnection = uniprot.openConnection();
281 BufferedReader in = new BufferedReader(
282 new InputStreamReader(
283 uniprotConnection.getInputStream()));
284 String inputLine;
285
286 while ((inputLine = in.readLine()) != null) {
287 sb.append(inputLine);
288 }
289 in.close();
290 index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
291 if (index != -1) {
292 int lastIndex = sb.indexOf(">", index);
293 sb.replace(index, lastIndex, "");
294 }
295 if (f != null) {
296 FileWriter fw = new FileWriter(f);
297 fw.write(sb.toString());
298 fw.close();
299 }
300 }
301
302 logger.info("Load complete");
303 try {
304 // System.out.println(sb.toString());
305 Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
306 return document;
307 } catch (Exception e) {
308 System.out.println("Exception on xml parse of:" + sb.toString());
309 }
310 return null;
311 }
312 /**
313 *
314 * @param uniprotDoc
315 * @return
316 * @throws Exception
317 */
318 private String getSequence(Document uniprotDoc) throws Exception {
319 Element uniprotElement = uniprotDoc.getDocumentElement();
320 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
321 Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
322
323 String seqdata = sequenceElement.getTextContent();
324
325 return seqdata;
326 }
327
328 /**
329 * The current unirpot URL to deal with caching issues. www.uniprot.org is loaded balanced
330 * but you can access pir.uniprot.org directly.
331 * @return the uniprotbaseURL
332 */
333 public static String getUniprotbaseURL() {
334 return uniprotbaseURL;
335 }
336
337 /**
338 * @param aUniprotbaseURL the uniprotbaseURL to set
339 */
340 public static void setUniprotbaseURL(String aUniprotbaseURL) {
341 uniprotbaseURL = aUniprotbaseURL;
342 }
343
344 /**
345 * Local directory cache of XML that can be downloaded
346 * @return the uniprotDirectoryCache
347 */
348 public static String getUniprotDirectoryCache() {
349 return uniprotDirectoryCache;
350 }
351
352 /**
353 * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
354 */
355 public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
356 File f = new File(aUniprotDirectoryCache);
357 if(f.exists() == false){
358 f.mkdirs();
359 }
360 uniprotDirectoryCache = aUniprotDirectoryCache;
361 }
362
363 public static void main(String[] args) {
364
365 try {
366 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
367 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
368
369 System.out.println("Sequence=" + proteinSequence.getSequenceAsString());
370 } catch (Exception e) {
371 e.printStackTrace();
372 }
373
374 }
375
376 /**
377 * Get the gene name associated with this sequence.
378 * @return
379 * @throws Exception
380 */
381
382 public String getGeneName() throws Exception{
383 if(uniprotDoc == null)
384 return "";
385 Element uniprotElement = uniprotDoc.getDocumentElement();
386 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
387 Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
388 if(geneElement == null)
389 return "";
390 Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
391 if(nameElement == null)
392 return "";
393 return nameElement.getTextContent();
394 }
395
396 /**
397 * Get the organism name assigned to this sequence
398 * @return
399 * @throws Exception
400 */
401
402 public String getOrganismName() throws Exception{
403 if(uniprotDoc == null)
404 return "";
405 Element uniprotElement = uniprotDoc.getDocumentElement();
406 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
407 Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
408 if(organismElement == null)
409 return "";
410 Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
411 if(nameElement == null)
412 return "";
413 return nameElement.getTextContent();
414 }
415
416 /**
417 * Pull uniprot key words which is a mixed bag of words associated with this sequence
418 * @return
419 * @throws Exception
420 */
421
422 public ArrayList<String> getKeyWords() throws Exception {
423 ArrayList<String> keyWordsList = new ArrayList<String>();
424 if (uniprotDoc == null) {
425 return keyWordsList;
426 }
427 Element uniprotElement = uniprotDoc.getDocumentElement();
428 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
429 ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
430 for (Element element : keyWordElementList) {
431 keyWordsList.add(element.getTextContent());
432 }
433
434 return keyWordsList;
435 }
436
437 /**
438 * The Uniprot mappings to other database identifiers for this sequence
439 * @return
440 * @throws Exception
441 */
442
443 public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() throws Exception {
444 LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
445 if (uniprotDoc == null) {
446 return databaseReferencesHashMap;
447 }
448
449 Element uniprotElement = uniprotDoc.getDocumentElement();
450 Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
451 ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
452 for (Element element : dbreferenceElementList) {
453 String type = element.getAttribute("type");
454 String id = element.getAttribute("id");
455 ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
456 if (idlist == null) {
457 idlist = new ArrayList<DBReferenceInfo>();
458 databaseReferencesHashMap.put(type, idlist);
459 }
460 DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
461 ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
462 for (Element propertyElement : propertyElementList) {
463 String propertyType = propertyElement.getAttribute("type");
464 String propertyValue = propertyElement.getAttribute("value");
465 dbreferenceInfo.addProperty(propertyType, propertyValue);
466 }
467
468 idlist.add(dbreferenceInfo);
469 }
470
471
472 return databaseReferencesHashMap;
473 }
474 }