001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022 package org.biojava3.core.sequence.io;
023
024 import java.io.InputStream;
025 import java.util.ArrayList;
026 import java.util.Arrays;
027 import java.util.HashMap;
028 import java.util.List;
029 import java.util.Map;
030
031 import org.biojava3.core.exceptions.ParserException;
032 import org.biojava3.core.sequence.compound.AminoAcidCompound;
033 import org.biojava3.core.sequence.compound.NucleotideCompound;
034 import org.biojava3.core.sequence.io.util.ClasspathResource;
035 import org.biojava3.core.sequence.io.util.IOUtils;
036 import org.biojava3.core.sequence.template.AbstractCompoundSet;
037 import org.biojava3.core.sequence.template.CompoundSet;
038 import org.biojava3.core.sequence.transcription.Table;
039
040
041 /**
042 * Available translations
043 *
044 * <ul>
045 * <li>1 - UNIVERSAL</li>
046 * <li>2 - VERTEBRATE_MITOCHONDRIAL</li>
047 * <li>3 - YEAST_MITOCHONDRIAL</li>
048 * <li>4 - MOLD_MITOCHONDRIAL</li>
049 * <li>5 - INVERTEBRATE_MITOCHONDRIAL</li>
050 * <li>6 - CILIATE_NUCLEAR</li>
051 * <li>9 - ECHINODERM_MITOCHONDRIAL</li>
052 * <li>10 - EUPLOTID_NUCLEAR</li>
053 * <li>11 - BACTERIAL</li>
054 * <li>12 - ALTERNATIVE_YEAST_NUCLEAR</li>
055 * <li>13 - ASCIDIAN_MITOCHONDRIAL</li>
056 * <li>14 - FLATWORM_MITOCHONDRIAL</li>
057 * <li>15 - BLEPHARISMA_MACRONUCLEAR</li>
058 * <li>16 - 2CHLOROPHYCEAN_MITOCHONDRIAL</li>
059 * <li>21 - TREMATODE_MITOCHONDRIAL</li>
060 * <li>23 - SCENEDESMUS_MITOCHONDRIAL</li>
061 * </ul>
062 *
063 * Taken from <a
064 * href="http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"
065 * >NCBI</a> with slight modification and put into the classpath resource.
066 *
067 * Takes in an ID, name, amino acid string and the locations of amino acids
068 * which acts as start codons in the translation table. You can give the 3 codon
069 * position strings that correspond to the amino acid string or if you are using
070 * the default IUPAC codes you can use the hardcoded ones which are consistent
071 * amongst all <a
072 * href="http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c"> codon
073 * tables</a>.
074 *
075 * The generated {@link IUPACTable} objects do not parse the data further until
076 * requested so if you do not use a translation table your only penalty is the
077 * loading of the IUPAC data from the classpath.
078 *
079 * @author Andy Yates
080 */
081 public class IUPACParser {
082
083 private static class IOD {
084 public static final IUPACParser INSTANCE = new IUPACParser();
085 }
086
087 public static IUPACParser getInstance() {
088 return IOD.INSTANCE;
089 }
090
091 public static final String IUPAC_LOCATION = "org/biojava3/core/sequence/iupac.txt";
092
093 private InputStream is;
094 private List<IUPACTable> tables;
095 private Map<String, IUPACTable> nameLookup;
096 private Map<Integer, IUPACTable> idLookup;
097
098 /**
099 * Default version and uses the classpath based IUPAC table
100 */
101 public IUPACParser() {
102 //use the preCache version to make sure we don't keep a IO handle open
103 is = new ClasspathResource(IUPAC_LOCATION, true).getInputStream();
104 }
105
106 /**
107 * Allows you to specify a different IUPAC table.
108 */
109 public IUPACParser(InputStream is) {
110 this.is = is;
111 }
112
113 /**
114 * Returns a list of all available IUPAC tables
115 */
116 public List<IUPACTable> getTables() {
117 if (tables == null) {
118 tables = parseTables();
119 }
120 return tables;
121 }
122
123 /**
124 * Returns a table by its name
125 */
126 public IUPACTable getTable(String name) {
127 populateLookups();
128 return nameLookup.get(name);
129 }
130
131 /**
132 * Returns a table by its identifier i.e. 1 means universal codon tables
133 */
134 public IUPACTable getTable(Integer id) {
135 populateLookups();
136 return idLookup.get(id);
137 }
138
139 private void populateLookups() {
140 if(nameLookup == null) {
141 nameLookup = new HashMap<String, IUPACTable>();
142 idLookup = new HashMap<Integer, IUPACTable>();
143 for(IUPACTable t: getTables()) {
144 nameLookup.put(t.getName(), t);
145 idLookup.put(t.getId(), t);
146 }
147 }
148 }
149
150 private List<IUPACTable> parseTables() {
151 List<IUPACTable> localTables = new ArrayList<IUPACTable>();
152 List<String> lines = IOUtils.getList(is);
153 Integer id = null;
154 String name, aa, starts, baseone, basetwo, basethree;
155 name = aa = starts = baseone = basetwo = basethree = null;
156 for (String line : lines) {
157 if (line.equalsIgnoreCase("//")) {
158 localTables.add(new IUPACTable(name, id, aa, starts, baseone, basetwo,
159 basethree));
160 name = aa = starts = baseone = basetwo = basethree = null;
161 id = null;
162 }
163 else {
164 String[] keyValue = line.split("\\s*=\\s*");
165 if (keyValue[0].equals("AAs")) {
166 aa = keyValue[1];
167 }
168 else if (keyValue[0].equals("Starts")) {
169 starts = keyValue[1];
170 }
171 else if (keyValue[0].equals("Base1")) {
172 baseone = keyValue[1];
173 }
174 else if (keyValue[0].equals("Base2")) {
175 basetwo = keyValue[1];
176 }
177 else if (keyValue[0].equals("Base3")) {
178 basethree = keyValue[1];
179 }
180 else {
181 name = keyValue[0];
182 id = Integer.parseInt(keyValue[1]);
183 }
184 }
185 }
186
187 return localTables;
188 }
189
190 /**
191 * Holds the concept of a codon table from the IUPAC format
192 *
193 * @author Andy Yates
194 */
195 public static class IUPACTable implements Table {
196
197 private final Integer id;
198 private final String name;
199 private final String aminoAcidString;
200 private final String startCodons;
201 private final String baseOne;
202 private final String baseTwo;
203 private final String baseThree;
204
205 private final List<Codon> codons = new ArrayList<Codon>();
206 private CompoundSet<Codon> compounds = null;
207
208 public IUPACTable(String name, int id, String aminoAcidString,
209 String startCodons, String baseOne, String baseTwo, String baseThree) {
210 this.aminoAcidString = aminoAcidString;
211 this.startCodons = startCodons;
212 this.name = name;
213 this.id = id;
214 this.baseOne = baseOne;
215 this.baseTwo = baseTwo;
216 this.baseThree = baseThree;
217 }
218
219 /**
220 * Constructor which uses the basic IUPAC codon table format. Useful
221 * if you need to specify your own IUPAC table with minimal
222 * definitions from your side.
223 */
224 public IUPACTable(String name, Integer id, String aminoAcidString,
225 String startCodons) {
226 this(name, id, aminoAcidString, startCodons,
227 "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
228 "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
229 "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG");
230 }
231
232 public Integer getId() {
233 return id;
234 }
235
236 public String getName() {
237 return name;
238 }
239
240 /**
241 * Returns true if the given compound was a start codon in this
242 * codon table. This will report true if the compound could ever have
243 * been a start codon.
244 *
245 * @throws IllegalStateException Thrown if
246 * {@link #getCodons(CompoundSet, CompoundSet)} was not called first.
247 */
248 @Override
249 public boolean isStart(AminoAcidCompound compound) throws IllegalStateException {
250 if(this.codons.isEmpty()) {
251 throw new IllegalStateException("Codons are empty; please request getCodons() fist before asking this");
252 }
253 for(Codon codon: codons) {
254 //Only check if the codon was a start codon and then ask if the compound was encoded by it
255 if(codon.isStart()) {
256 if(codon.getAminoAcid().equalsIgnoreCase(compound)) {
257 return true;
258 }
259 }
260 }
261 return false;
262 }
263
264 /**
265 * Returns a list of codons where the source and target compounds
266 * are the same as those given by the parameters.
267 *
268 * @param nucleotides The nucleotide set to use when building BioJava
269 * representations of codons
270 * @param aminoAcids The target amino acid compounds objects
271 */
272 @Override
273 public List<Codon> getCodons(CompoundSet<NucleotideCompound> nucelotides,
274 CompoundSet<AminoAcidCompound> aminoAcids) {
275
276 if (this.codons.isEmpty()) {
277 List<String> aminoAcidStrings = aminoAcids();
278 List<String> startCodonStrings = startCodons();
279 List<List<String>> codonStrings = codonStrings();
280
281 for (int i = 0; i < aminoAcidStrings.size(); i++) {
282
283 List<String> codonString = codonStrings.get(i);
284 NucleotideCompound one = getCompound(codonString, 0, nucelotides);
285 NucleotideCompound two = getCompound(codonString, 1, nucelotides);
286 NucleotideCompound three = getCompound(codonString, 2, nucelotides);
287 boolean start = ("M".equals(startCodonStrings.get(i)));
288 boolean stop = ("*".equals(aminoAcidStrings.get(i)));
289 AminoAcidCompound aminoAcid = aminoAcids
290 .getCompoundForString(aminoAcidStrings.get(i));
291 codons.add(new Codon(new CaseInsensitiveTriplet(one, two, three), aminoAcid, start, stop));
292 }
293 }
294
295 return codons;
296 }
297
298 private NucleotideCompound getCompound(List<String> compounds,
299 int position, CompoundSet<NucleotideCompound> nucelotides) {
300 String compound = compounds.get(position);
301 NucleotideCompound returnCompound = nucelotides
302 .getCompoundForString(compound);
303 if (returnCompound == null) {
304 if ("T".equalsIgnoreCase(compound)) {
305 returnCompound = nucelotides.getCompoundForString("U");
306 }
307 else {
308 throw new ParserException("Cannot find a compound for string "
309 + compound);
310 }
311 }
312 return returnCompound;
313 }
314
315 /**
316 * Returns the compound set of codons
317 */
318 public CompoundSet<Codon> getCodonCompoundSet(
319 final CompoundSet<NucleotideCompound> rnaCompounds,
320 final CompoundSet<AminoAcidCompound> aminoAcidCompounds) {
321 if (compounds == null) {
322 compounds = new AbstractCompoundSet<Codon>() {
323 {
324 for (Codon c : getCodons(rnaCompounds, aminoAcidCompounds)) {
325 addCompound(c);
326 }
327 }
328 };
329 }
330 return compounds;
331 }
332
333 private List<List<String>> codonStrings() {
334 List<List<String>> codons = new ArrayList<List<String>>();
335 for (int i = 0; i < baseOne.length(); i++) {
336 List<String> codon = Arrays.asList(Character
337 .toString(baseOne.charAt(i)),
338 Character.toString(baseTwo.charAt(i)), Character.toString(baseThree
339 .charAt(i)));
340 codons.add(codon);
341 }
342 return codons;
343 }
344
345 private List<String> aminoAcids() {
346 return split(aminoAcidString);
347 }
348
349 private List<String> startCodons() {
350 return split(startCodons);
351 }
352
353 private List<String> split(String string) {
354 List<String> split = new ArrayList<String>();
355 for (int i = 0; i < string.length(); i++) {
356 split.add(Character.toString(string.charAt(i)));
357 }
358 return split;
359 }
360 }
361 }