OpenBio*: Main.java

File Main.java, 4.7 KB (added by markjschreiber, 16 years ago)

Roundtrip program for biojava

Line 
1package roundtrip;
2
3import java.io.BufferedReader;
4import java.io.FileReader;
5import java.io.IOException;
6import org.biojava.bio.seq.io.ParseException;
7import org.biojava.bio.seq.io.SymbolTokenization;
8import org.biojava.bio.symbol.AlphabetManager;
9import org.biojava.bio.symbol.FiniteAlphabet;
10import org.biojavax.Namespace;
11import org.biojavax.bio.seq.RichSequence;
12import org.biojavax.bio.seq.io.EMBLFormat;
13import org.biojavax.bio.seq.io.EMBLxmlFormat;
14import org.biojavax.bio.seq.io.FastaFormat;
15import org.biojavax.bio.seq.io.GenbankFormat;
16import org.biojavax.bio.seq.io.INSDseqFormat;
17import org.biojavax.bio.seq.io.RichSequenceBuilderFactory;
18import org.biojavax.bio.seq.io.RichSequenceFormat;
19import org.biojavax.bio.seq.io.RichStreamReader;
20import org.biojavax.bio.seq.io.RichStreamWriter;
21import org.biojavax.bio.seq.io.UniProtFormat;
22import org.biojavax.bio.seq.io.UniProtXMLFormat;
23import org.biojavax.bio.taxa.NCBITaxon;
24import org.biojavax.bio.taxa.io.NCBITaxonomyLoader;
25import org.biojavax.bio.taxa.io.SimpleNCBITaxonomyLoader;
26
27/**
28 * This program will round trip sequence formats
29 * @author Mark
30 */
31public class Main {
32
33    /**
34     * Attempts to find a format for a name String such as "genbank" or for a
35     * fully qualified string like org.biojavax.bio.seq.io.UniProtFormat
36     * @return the matching <code>RichSequenceFormat</code>
37     * @param name the name of the format, case insensitive except for qualified class names
38     * @throws java.lang.IllegalAccessException If java cannot reflectively access the named format.
39     * Only applies to fully qualified class names.
40     * @throws java.lang.ClassNotFoundException If a format can not be found for the name.
41     * @throws java.lang.InstantiationException If the found object cannot be created (only applies
42     * to fully qualified class names).
43     */
44    public static RichSequenceFormat formatForName(String name)
45            throws ClassNotFoundException, InstantiationException, IllegalAccessException {
46        //determine the format to use
47        RichSequenceFormat format;
48        if (name.equalsIgnoreCase("fasta")) {
49            format = new FastaFormat();
50        } else if (name.equalsIgnoreCase("genbank")) {
51            format = new GenbankFormat();
52        } else if (name.equalsIgnoreCase("uniprot")) {
53            format = new UniProtFormat();
54        } else if (name.equalsIgnoreCase("embl")) {
55            format = new EMBLFormat();
56        } else if (name.equalsIgnoreCase("INSDseq")) {
57            format = new INSDseqFormat();
58        } else if (name.equalsIgnoreCase("EMBLxml")) {
59            format = new EMBLxmlFormat();
60        } else if (name.equalsIgnoreCase("UniprotXML")){
61            format = new UniProtXMLFormat();
62        } else {
63            Class formatClass = Class.forName(name);
64            format = (RichSequenceFormat) formatClass.newInstance();
65        }
66        return format;
67    }
68
69    public static void loadNCBITaxon() throws IOException, ParseException{
70        NCBITaxonomyLoader l = new SimpleNCBITaxonomyLoader();
71        BufferedReader nodes = new BufferedReader(new FileReader("nodes.dmp"));
72        BufferedReader names = new BufferedReader(new FileReader("names.dmp"));
73
74        NCBITaxon t;
75        while ((t = l.readNode(nodes)) != null) {}  // read all the nodes first
76        while ((t = l.readName(names)) != null) {}  // then read all the names
77    }
78
79    /**
80     * @param args the command line arguments
81     * args[0] the input file name
82     * args[1] the input format name or fully qualified classname (eg fasta, or
83     * org.biojavax.bio.seq.io.FastaFormat)
84     * args[2] the ouput format name (see above)
85     * args[3] the alphabet (commonly DNA or Protein)
86     * args[4] the namespace (something like gb)
87     */
88    public static void main(String[] args) throws Exception {
89        BufferedReader br = new BufferedReader(new FileReader(args[0]));
90        RichSequenceFormat inFormat = formatForName(args[1]);
91        RichSequenceFormat outFormat = formatForName(args[2]);
92        FiniteAlphabet alpha = (FiniteAlphabet) AlphabetManager.alphabetForName(args[3]);
93        Namespace ns = null;
94        SymbolTokenization toke = alpha.getTokenization("default");
95       
96        if(! (inFormat.getClass().equals(formatForName("fasta").getClass())
97                || outFormat.getClass().equals(formatForName("fasta").getClass()))){
98            System.out.println("Loading NCBI taxonomy");
99            loadNCBITaxon();
100        }
101
102        RichStreamReader sr = new RichStreamReader(
103                br, inFormat, toke,
104                RichSequenceBuilderFactory.THRESHOLD,
105                null);
106       
107        RichStreamWriter sw = new RichStreamWriter(System.out, outFormat);
108        sw.writeStream(sr, ns);
109    }
110}