001    /*
002     *                    BioJava development code
003     *
004     * This code may be freely distributed and modified under the
005     * terms of the GNU Lesser General Public Licence.  This should
006     * be distributed with the code.  If you do not have a copy,
007     * see:
008     *
009     *      http://www.gnu.org/copyleft/lesser.html
010     *
011     * Copyright for this code is held jointly by the individual
012     * authors.  These should be listed in @author doc comments.
013     *
014     * For more information on the BioJava project and its aims,
015     * or to join the biojava-l mailing list, visit the home page
016     * at:
017     *
018     *      http://www.biojava.org/
019     *
020     * Created on 01-21-2010
021     */
022    package org.biojava3.core.sequence.location;
023    
024    import java.io.IOException;
025    import java.io.Reader;
026    import java.io.StringReader;
027    import java.util.ArrayList;
028    import java.util.List;
029    import java.util.regex.Matcher;
030    import java.util.regex.Pattern;
031    import org.biojava3.core.exceptions.ParserException;
032    import org.biojava3.core.sequence.AccessionID;
033    import org.biojava3.core.sequence.DataSource;
034    import org.biojava3.core.sequence.Strand;
035    import org.biojava3.core.sequence.location.template.Location;
036    import org.biojava3.core.sequence.location.template.Point;
037    
038    /**
039     * Parser for working with INSDC style locations. This class supports the
040     * full range of location types generated by Genbank, INSDC and ENA.
041     *
042     * @author ayates
043     */
044    public class InsdcParser {
045    
046        private final DataSource dataSource;
047    
048        public InsdcParser() {
049            this(DataSource.ENA);
050        }
051    
052        public InsdcParser(DataSource dataSource) {
053            this.dataSource = dataSource;
054        }
055    
056        public DataSource getDataSource() {
057            return dataSource;
058        }
059    
060        /**
061         * Parses a location of the form Accession:1
062         */
063        private static final Pattern SINGLE_LOCATION = Pattern.compile(
064                "\\A ([A-Za-z.0-9]*?) :? ([<>]?) (\\d+) \\Z", Pattern.COMMENTS);
065    
066        /**
067         * Parses a location of the form Accession:1..4 (also supports the ^
068         * format and undefined locations)
069         */
070        private static final Pattern RANGE_LOCATION = Pattern.compile(
071                "\\A ([A-Za-z.0-9]*?) :? ([<>]?) (\\d+) ([.^]+) ([<>]?) (\\d+) \\Z", Pattern.COMMENTS);
072    
073        /**
074         * Main method for parsing a location from a String instance
075         *
076         * @param locationString Represents a logical location
077         * @return The parsed location
078         * @throws ParserException thrown in the event of any error during parsing
079         */
080        public Location parse(String locationString) throws ParserException {
081            try {
082                return parse(new StringReader(locationString));
083            }
084            catch(IOException e) {
085                throw new ParserException("Cannot parse the given location '"+
086                        locationString+"'", e);
087            }
088        }
089    
090        /**
091         * Reader based version of the parse methods.
092         *
093         * @param reader The source of the data; assumes that end of the reader
094         * stream is the end of the location string to parse
095         * @return The parsed location
096         * @throws IOException Thrown with any reader error
097         * @throws ParserException Thrown with any error with parsing locations
098         */
099        public Location parse(Reader reader) throws IOException, ParserException {
100            List<Location> out = parse(reader, Strand.POSITIVE);
101            if(out.size() > 1) {
102                throw new ParserException("Too many locations parsed "+out);
103            }
104            else if(out.isEmpty()) {
105                throw new ParserException("No locations parsed");
106            }
107            return out.get(0);
108        }
109    
110        protected List<Location> parse(Reader reader, Strand strand) throws IOException, ParserException {
111            StringBuilder sb = new StringBuilder();
112            String typeOfJoin = null;
113            List<Location> locationList = new ArrayList<Location>();
114    
115            int i = -1;
116            while( (i = reader.read()) != -1 ) {
117                char c = (char)i;
118                switch(c) {
119                    case '(':
120                        if(isComplement(sb)) {
121                            locationList.addAll(parse(reader, strand.getReverse()));
122                        }
123                        else {
124                            typeOfJoin = sb.toString();
125                            List<Location> subs = parse(reader, strand);
126                            locationList.add(LocationHelper.location(subs, typeOfJoin));
127                        }
128                        clearStringBuilder(sb);
129                        break;
130                    case ',':
131                    case ')':
132                        if(sb.length() > 0) {
133                            locationList.add(parseLocation(sb.toString(), strand));
134                        }
135                        if( c == ')') {
136                            return locationList;
137                        }
138                        clearStringBuilder(sb);
139                        break;
140                    default:
141                        if(!Character.isWhitespace(c)) {
142                            sb.append(c);
143                        }
144                        break;
145                }
146            }
147    
148            if(sb.length() != 0) {
149                 locationList.add(parseLocation(sb.toString(), strand));
150                 clearStringBuilder(sb);
151            }
152    
153            return locationList;
154        }
155    
156        private boolean isComplement(StringBuilder sb) {
157            return sb.toString().equals("complement");
158        }
159    
160        private void clearStringBuilder(StringBuilder sb) {
161            sb.delete(0, sb.length());
162        }
163    
164        protected Location parseLocation(String location, Strand strand) {
165            Matcher singleLoc = SINGLE_LOCATION.matcher(location);
166            Matcher rangeLoc = RANGE_LOCATION.matcher(location);
167            if(rangeLoc.matches()) {
168                return parseRange(rangeLoc, strand);
169            }
170            else if(singleLoc.matches()) {
171                return parseSingle(singleLoc, strand);
172            }
173            else {
174                throw new ParserException("Location string does not match "
175                        + "a single or range location");
176            }
177        }
178    
179        protected Location parseSingle(Matcher matcher, Strand strand) {
180            String accession = matcher.group(1);
181            String uncertain = matcher.group(2);
182            String location = matcher.group(3);
183            Point p = generatePoint(location, uncertain);
184            if (accession == null || "".equals(accession)) {
185                return new SimpleLocation(p, p, strand);
186            }
187            else {
188                return new SimpleLocation(p, p, strand, getAccession(accession));
189            }
190        }
191    
192        protected Location parseRange(Matcher matcher, Strand strand) {
193            String accession = matcher.group(1);
194            String type = matcher.group(4);
195            Point start = generatePoint(
196                    matcher.group(3),
197                    matcher.group(2));
198            Point end = generatePoint(
199                    matcher.group(6),
200                    matcher.group(5));
201            boolean betweenBases = "^".equals(type);
202            if (accession == null || "".equals(accession)) {
203                return new SimpleLocation(start, end, strand, false, betweenBases);
204            }
205            else {
206                return new SimpleLocation(start, end, strand, betweenBases, getAccession(accession));
207            }
208        }
209    
210        protected Point generatePoint(String locationString, String uncertainString) {
211            int location = Integer.valueOf(locationString);
212            boolean unknown = false;
213            boolean uncertain = (!"".equals(uncertainString));
214            return new SimplePoint(location, unknown, uncertain);
215        }
216    
217        protected AccessionID getAccession(String accession) {
218            return new AccessionID(accession, getDataSource());
219        }
220    }