001 /*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence. This should
006 * be distributed with the code. If you do not have a copy,
007 * see:
008 *
009 * http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors. These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 * http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022 package org.biojava3.core.sequence.location;
023
024 import java.io.IOException;
025 import java.io.Reader;
026 import java.io.StringReader;
027 import java.util.ArrayList;
028 import java.util.List;
029 import java.util.regex.Matcher;
030 import java.util.regex.Pattern;
031 import org.biojava3.core.exceptions.ParserException;
032 import org.biojava3.core.sequence.AccessionID;
033 import org.biojava3.core.sequence.DataSource;
034 import org.biojava3.core.sequence.Strand;
035 import org.biojava3.core.sequence.location.template.Location;
036 import org.biojava3.core.sequence.location.template.Point;
037
038 /**
039 * Parser for working with INSDC style locations. This class supports the
040 * full range of location types generated by Genbank, INSDC and ENA.
041 *
042 * @author ayates
043 */
044 public class InsdcParser {
045
046 private final DataSource dataSource;
047
048 public InsdcParser() {
049 this(DataSource.ENA);
050 }
051
052 public InsdcParser(DataSource dataSource) {
053 this.dataSource = dataSource;
054 }
055
056 public DataSource getDataSource() {
057 return dataSource;
058 }
059
060 /**
061 * Parses a location of the form Accession:1
062 */
063 private static final Pattern SINGLE_LOCATION = Pattern.compile(
064 "\\A ([A-Za-z.0-9]*?) :? ([<>]?) (\\d+) \\Z", Pattern.COMMENTS);
065
066 /**
067 * Parses a location of the form Accession:1..4 (also supports the ^
068 * format and undefined locations)
069 */
070 private static final Pattern RANGE_LOCATION = Pattern.compile(
071 "\\A ([A-Za-z.0-9]*?) :? ([<>]?) (\\d+) ([.^]+) ([<>]?) (\\d+) \\Z", Pattern.COMMENTS);
072
073 /**
074 * Main method for parsing a location from a String instance
075 *
076 * @param locationString Represents a logical location
077 * @return The parsed location
078 * @throws ParserException thrown in the event of any error during parsing
079 */
080 public Location parse(String locationString) throws ParserException {
081 try {
082 return parse(new StringReader(locationString));
083 }
084 catch(IOException e) {
085 throw new ParserException("Cannot parse the given location '"+
086 locationString+"'", e);
087 }
088 }
089
090 /**
091 * Reader based version of the parse methods.
092 *
093 * @param reader The source of the data; assumes that end of the reader
094 * stream is the end of the location string to parse
095 * @return The parsed location
096 * @throws IOException Thrown with any reader error
097 * @throws ParserException Thrown with any error with parsing locations
098 */
099 public Location parse(Reader reader) throws IOException, ParserException {
100 List<Location> out = parse(reader, Strand.POSITIVE);
101 if(out.size() > 1) {
102 throw new ParserException("Too many locations parsed "+out);
103 }
104 else if(out.isEmpty()) {
105 throw new ParserException("No locations parsed");
106 }
107 return out.get(0);
108 }
109
110 protected List<Location> parse(Reader reader, Strand strand) throws IOException, ParserException {
111 StringBuilder sb = new StringBuilder();
112 String typeOfJoin = null;
113 List<Location> locationList = new ArrayList<Location>();
114
115 int i = -1;
116 while( (i = reader.read()) != -1 ) {
117 char c = (char)i;
118 switch(c) {
119 case '(':
120 if(isComplement(sb)) {
121 locationList.addAll(parse(reader, strand.getReverse()));
122 }
123 else {
124 typeOfJoin = sb.toString();
125 List<Location> subs = parse(reader, strand);
126 locationList.add(LocationHelper.location(subs, typeOfJoin));
127 }
128 clearStringBuilder(sb);
129 break;
130 case ',':
131 case ')':
132 if(sb.length() > 0) {
133 locationList.add(parseLocation(sb.toString(), strand));
134 }
135 if( c == ')') {
136 return locationList;
137 }
138 clearStringBuilder(sb);
139 break;
140 default:
141 if(!Character.isWhitespace(c)) {
142 sb.append(c);
143 }
144 break;
145 }
146 }
147
148 if(sb.length() != 0) {
149 locationList.add(parseLocation(sb.toString(), strand));
150 clearStringBuilder(sb);
151 }
152
153 return locationList;
154 }
155
156 private boolean isComplement(StringBuilder sb) {
157 return sb.toString().equals("complement");
158 }
159
160 private void clearStringBuilder(StringBuilder sb) {
161 sb.delete(0, sb.length());
162 }
163
164 protected Location parseLocation(String location, Strand strand) {
165 Matcher singleLoc = SINGLE_LOCATION.matcher(location);
166 Matcher rangeLoc = RANGE_LOCATION.matcher(location);
167 if(rangeLoc.matches()) {
168 return parseRange(rangeLoc, strand);
169 }
170 else if(singleLoc.matches()) {
171 return parseSingle(singleLoc, strand);
172 }
173 else {
174 throw new ParserException("Location string does not match "
175 + "a single or range location");
176 }
177 }
178
179 protected Location parseSingle(Matcher matcher, Strand strand) {
180 String accession = matcher.group(1);
181 String uncertain = matcher.group(2);
182 String location = matcher.group(3);
183 Point p = generatePoint(location, uncertain);
184 if (accession == null || "".equals(accession)) {
185 return new SimpleLocation(p, p, strand);
186 }
187 else {
188 return new SimpleLocation(p, p, strand, getAccession(accession));
189 }
190 }
191
192 protected Location parseRange(Matcher matcher, Strand strand) {
193 String accession = matcher.group(1);
194 String type = matcher.group(4);
195 Point start = generatePoint(
196 matcher.group(3),
197 matcher.group(2));
198 Point end = generatePoint(
199 matcher.group(6),
200 matcher.group(5));
201 boolean betweenBases = "^".equals(type);
202 if (accession == null || "".equals(accession)) {
203 return new SimpleLocation(start, end, strand, false, betweenBases);
204 }
205 else {
206 return new SimpleLocation(start, end, strand, betweenBases, getAccession(accession));
207 }
208 }
209
210 protected Point generatePoint(String locationString, String uncertainString) {
211 int location = Integer.valueOf(locationString);
212 boolean unknown = false;
213 boolean uncertain = (!"".equals(uncertainString));
214 return new SimplePoint(location, unknown, uncertain);
215 }
216
217 protected AccessionID getAccession(String accession) {
218 return new AccessionID(accession, getDataSource());
219 }
220 }