001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     *
022     * @author Scooter Willis
023     *
024     */
025    package org.biojava3.core.sequence.loader;
026    
027    import java.io.BufferedReader;
028    import java.io.ByteArrayInputStream;
029    import java.io.File;
030    import java.io.FileReader;
031    import java.io.FileWriter;
032    import java.io.InputStreamReader;
033    import java.net.URL;
034    import java.net.URLConnection;
035    import java.util.ArrayList;
036    import java.util.Iterator;
037    import java.util.LinkedHashMap;
038    import java.util.List;
039    import java.util.logging.Logger;
040    import org.biojava3.core.sequence.AccessionID;
041    
042    import org.biojava3.core.sequence.template.SequenceProxyView;
043    import org.biojava3.core.sequence.template.Compound;
044    import org.biojava3.core.exceptions.CompoundNotFoundError;
045    import org.biojava3.core.sequence.ProteinSequence;
046    import org.biojava3.core.sequence.Strand;
047    import org.biojava3.core.sequence.compound.AminoAcidCompound;
048    import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
049    import org.biojava3.core.sequence.features.DBReferenceInfo;
050    import org.biojava3.core.sequence.features.DatabaseReferenceInterface;
051    import org.biojava3.core.sequence.features.FeaturesKeyWordInterface;
052    
053    import org.biojava3.core.sequence.storage.SequenceAsStringHelper;
054    import org.biojava3.core.sequence.template.CompoundSet;
055    import org.biojava3.core.sequence.template.ProxySequenceReader;
056    import org.biojava3.core.sequence.template.SequenceMixin;
057    import org.biojava3.core.sequence.template.SequenceView;
058    import org.biojava3.core.util.XMLHelper;
059    import org.w3c.dom.Document;
060    import org.w3c.dom.Element;
061    
062    /**
063     * 
064     * Pass in a Uniprot ID and this ProxySequenceReader when passed to a ProteinSequence will get the sequence data and other data elements
065     * associated with the ProteinSequence by Uniprot. This is an example of how to map external databases of proteins and features to the BioJava3
066     * ProteinSequence.
067     * Important to call @see setUniprotDirectoryCache to allow caching of XML files so they don't need to be reloaded each time. Does
068     * not manage cache.
069     * @param <C>
070     */
071    
072    public class UniprotProxySequenceReader<C extends Compound> implements ProxySequenceReader<C>, FeaturesKeyWordInterface, DatabaseReferenceInterface {
073    
074        private static final Logger logger = Logger.getLogger(UniprotProxySequenceReader.class.getName());
075        private static String uniprotbaseURL = "http://www.uniprot.org"; //"http://pir.uniprot.org";
076        private static String uniprotDirectoryCache = null;
077    
078    
079        private String sequence;
080        private CompoundSet<C> compoundSet;
081        private List<C> parsedCompounds = new ArrayList<C>();
082        Document uniprotDoc;
083    
084    
085        /**
086         * The uniprot id is used to retrieve the uniprot XML which is then parsed as a DOM object
087         * so we know everything about the protein. If an error occurs throw an exception. We could
088         * have a bad uniprot id or network error
089         * @param accession
090         * @param compoundSet
091         * @throws Exception
092         */
093        public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet) throws Exception {
094    
095            setCompoundSet(compoundSet);
096            uniprotDoc = this.getUniprotXML(accession);
097            String seq = this.getSequence(uniprotDoc);
098            setContents(seq);
099        }
100    
101        public void setCompoundSet(CompoundSet<C> compoundSet) {
102            this.compoundSet = compoundSet;
103        }
104    
105        /**
106         * Once the sequence is retrieved set the contents and make sure everything this is valid
107         * @param sequence
108         */
109        public void setContents(String sequence) {
110            // Horrendously inefficient - pretty much the way the old BJ did things.
111            // TODO Should be optimised.
112            this.sequence = sequence;
113            this.parsedCompounds.clear();
114            for (int i = 0; i < sequence.length();) {
115                String compoundStr = null;
116                C compound = null;
117                for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) {
118                    compoundStr = sequence.substring(i, i + compoundStrLength);
119                    compound = compoundSet.getCompoundForString(compoundStr);
120                }
121                if (compound == null) {
122                    throw new CompoundNotFoundError(compoundStr);
123                } else {
124                    i += compoundStr.length();
125                }
126                this.parsedCompounds.add(compound);
127            }
128        }
129    
130        /**
131         * The sequence length
132         * @return
133         */
134        public int getLength() {
135            return this.parsedCompounds.size();
136        }
137    
138        /**
139         *
140         * @param position
141         * @return
142         */
143        public C getCompoundAt(int position) {
144            return this.parsedCompounds.get(position - 1);
145        }
146    /**
147     *
148     * @param compound
149     * @return
150     */
151        public int getIndexOf(C compound) {
152            return this.parsedCompounds.indexOf(compound) + 1;
153        }
154    /**
155     *
156     * @param compound
157     * @return
158     */
159        public int getLastIndexOf(C compound) {
160            return this.parsedCompounds.lastIndexOf(compound) + 1;
161        }
162    /**
163     *
164     * @return
165     */
166        
167        @Override
168        public String toString() {
169            return getSequenceAsString();
170        }
171    /**
172     *
173     * @return
174     */
175        public String getSequenceAsString() {
176            return sequence;
177        }
178    /**
179     *
180     * @return
181     */
182        public List<C> getAsList() {
183            return this.parsedCompounds;
184        }
185    /**
186     *
187     * @return
188     */
189        @Override
190        public SequenceView<C> getInverse() {
191            return SequenceMixin.inverse(this);
192        }
193    /**
194     *
195     * @param bioBegin
196     * @param bioEnd
197     * @param strand
198     * @return
199     */
200        
201        public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {
202            SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>();
203            return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
204        }
205    /**
206     *
207     * @param bioBegin
208     * @param bioEnd
209     * @return
210     */
211        public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {
212            return new SequenceProxyView<C>(UniprotProxySequenceReader.this, bioBegin, bioEnd);
213        }
214    /**
215     *
216     * @return
217     */
218        public Iterator<C> iterator() {
219            return this.parsedCompounds.iterator();
220        }
221    /**
222     *
223     * @return
224     */
225        public CompoundSet<C> getCompoundSet() {
226            return compoundSet;
227        }
228    /**
229     *
230     * @return
231     */
232        
233        public AccessionID getAccession() {
234            throw new UnsupportedOperationException("Not supported yet.");
235        }
236    /**
237     *
238     * @param compounds
239     * @return
240     */
241        
242        public int countCompounds(C... compounds) {
243            throw new UnsupportedOperationException("Not supported yet.");
244        }
245    /**
246     *
247     * @param accession
248     * @return
249     * @throws Exception
250     */
251        private Document getUniprotXML(String accession) throws Exception {
252            int index = accession.lastIndexOf(".");
253            String key = accession;
254            if (index != -1) {
255                key = accession.substring(0, index);
256            }
257            StringBuilder sb = new StringBuilder();
258            File f = null;
259            if (uniprotDirectoryCache != null && uniprotDirectoryCache.length() > 0) {
260                f = new File(uniprotDirectoryCache + File.separatorChar + key + ".xml");
261                if (f.exists()) {
262                    FileReader fr = new FileReader(f);
263                    int size = (int) f.length();
264                    char[] data = new char[size];
265                    fr.read(data);
266                    fr.close();
267                    sb.append(data);
268                    index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
269                    if (index != -1) {
270                        int lastIndex = sb.indexOf(">", index);
271                        sb.replace(index, lastIndex, "");
272                    }
273                }
274    
275            }
276            if (sb.length() == 0) {
277                String uniprotURL = getUniprotbaseURL() + "/uniprot/" + key + ".xml";
278                logger.info("Loading " + uniprotURL);
279                URL uniprot = new URL(uniprotURL);
280                URLConnection uniprotConnection = uniprot.openConnection();
281                BufferedReader in = new BufferedReader(
282                        new InputStreamReader(
283                        uniprotConnection.getInputStream()));
284                String inputLine;
285    
286                while ((inputLine = in.readLine()) != null) {
287                    sb.append(inputLine);
288                }
289                in.close();
290                index = sb.indexOf("xmlns="); //strip out name space stuff to make it easier on xpath
291                if (index != -1) {
292                    int lastIndex = sb.indexOf(">", index);
293                    sb.replace(index, lastIndex, "");
294                }
295                if (f != null) {
296                    FileWriter fw = new FileWriter(f);
297                    fw.write(sb.toString());
298                    fw.close();
299                }
300            }
301    
302            logger.info("Load complete");
303            try {
304                //       System.out.println(sb.toString());
305                Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(sb.toString().getBytes()));
306                return document;
307            } catch (Exception e) {
308                System.out.println("Exception on xml parse of:" + sb.toString());
309            }
310            return null;
311        }
312    /**
313     *
314     * @param uniprotDoc
315     * @return
316     * @throws Exception
317     */
318        private String getSequence(Document uniprotDoc) throws Exception {
319            Element uniprotElement = uniprotDoc.getDocumentElement();
320            Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
321            Element sequenceElement = XMLHelper.selectSingleElement(entryElement, "sequence");
322    
323            String seqdata = sequenceElement.getTextContent();
324    
325            return seqdata;
326        }
327    
328        /**
329         * The current unirpot URL to deal with caching issues. www.uniprot.org is loaded balanced
330         * but you can access pir.uniprot.org directly.
331         * @return the uniprotbaseURL
332         */
333        public static String getUniprotbaseURL() {
334            return uniprotbaseURL;
335        }
336    
337        /**
338         * @param aUniprotbaseURL the uniprotbaseURL to set
339         */
340        public static void setUniprotbaseURL(String aUniprotbaseURL) {
341            uniprotbaseURL = aUniprotbaseURL;
342        }
343    
344         /**
345          * Local directory cache of XML that can be downloaded
346         * @return the uniprotDirectoryCache
347         */
348        public static String getUniprotDirectoryCache() {
349            return uniprotDirectoryCache;
350        }
351    
352        /**
353         * @param aUniprotDirectoryCache the uniprotDirectoryCache to set
354         */
355        public static void setUniprotDirectoryCache(String aUniprotDirectoryCache) {
356            File f = new File(aUniprotDirectoryCache);
357            if(f.exists() == false){
358                f.mkdirs();
359            }
360            uniprotDirectoryCache = aUniprotDirectoryCache;
361        }
362    
363        public static void main(String[] args) {
364    
365            try {
366                UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>("YA745_GIBZE", AminoAcidCompoundSet.getAminoAcidCompoundSet());
367                ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
368    
369                System.out.println("Sequence=" + proteinSequence.getSequenceAsString());
370            } catch (Exception e) {
371                e.printStackTrace();
372            }
373    
374        }
375    
376        /**
377         * Get the gene name associated with this sequence. 
378         * @return
379         * @throws Exception
380         */
381    
382        public String getGeneName() throws Exception{
383            if(uniprotDoc == null)
384                return "";
385            Element uniprotElement = uniprotDoc.getDocumentElement();
386            Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
387            Element geneElement = XMLHelper.selectSingleElement(entryElement, "gene");
388            if(geneElement == null)
389                return "";
390            Element nameElement = XMLHelper.selectSingleElement(geneElement, "name");
391            if(nameElement == null)
392                return "";
393            return nameElement.getTextContent();
394        }
395    
396        /**
397         * Get the organism name assigned to this sequence
398         * @return
399         * @throws Exception
400         */
401    
402        public String getOrganismName() throws Exception{
403            if(uniprotDoc == null)
404                return "";
405            Element uniprotElement = uniprotDoc.getDocumentElement();
406            Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
407            Element organismElement = XMLHelper.selectSingleElement(entryElement, "organism");
408            if(organismElement == null)
409                return "";
410            Element nameElement = XMLHelper.selectSingleElement(organismElement, "name");
411            if(nameElement == null)
412                return "";
413            return nameElement.getTextContent();
414        }
415    
416        /**
417         * Pull uniprot key words which is a mixed bag of words associated with this sequence
418         * @return
419         * @throws Exception
420         */
421        
422        public ArrayList<String> getKeyWords() throws Exception {
423            ArrayList<String> keyWordsList = new ArrayList<String>();
424            if (uniprotDoc == null) {
425                return keyWordsList;
426            }
427            Element uniprotElement = uniprotDoc.getDocumentElement();
428            Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
429            ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "keyword");
430            for (Element element : keyWordElementList) {
431                keyWordsList.add(element.getTextContent());
432            }
433    
434            return keyWordsList;
435        }
436    
437        /**
438         * The Uniprot mappings to other database identifiers for this sequence
439         * @return
440         * @throws Exception
441         */
442        
443        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() throws Exception {
444            LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferencesHashMap = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
445            if (uniprotDoc == null) {
446                return databaseReferencesHashMap;
447            }
448    
449            Element uniprotElement = uniprotDoc.getDocumentElement();
450            Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
451            ArrayList<Element> dbreferenceElementList = XMLHelper.selectElements(entryElement, "dbReference");
452            for (Element element : dbreferenceElementList) {
453                String type = element.getAttribute("type");
454                String id = element.getAttribute("id");
455                ArrayList<DBReferenceInfo> idlist = databaseReferencesHashMap.get(type);
456                if (idlist == null) {
457                    idlist = new ArrayList<DBReferenceInfo>();
458                    databaseReferencesHashMap.put(type, idlist);
459                }
460                DBReferenceInfo dbreferenceInfo = new DBReferenceInfo(type, id);
461                ArrayList<Element> propertyElementList = XMLHelper.selectElements(element, "property");
462                for (Element propertyElement : propertyElementList) {
463                    String propertyType = propertyElement.getAttribute("type");
464                    String propertyValue = propertyElement.getAttribute("value");
465                    dbreferenceInfo.addProperty(propertyType, propertyValue);
466                }
467    
468                idlist.add(dbreferenceInfo);
469            }
470    
471    
472            return databaseReferencesHashMap;
473        }
474    }