1 | package roundtrip; |
---|
2 | |
---|
3 | import java.io.BufferedReader; |
---|
4 | import java.io.FileReader; |
---|
5 | import java.io.IOException; |
---|
6 | import org.biojava.bio.seq.io.ParseException; |
---|
7 | import org.biojava.bio.seq.io.SymbolTokenization; |
---|
8 | import org.biojava.bio.symbol.AlphabetManager; |
---|
9 | import org.biojava.bio.symbol.FiniteAlphabet; |
---|
10 | import org.biojavax.Namespace; |
---|
11 | import org.biojavax.bio.seq.RichSequence; |
---|
12 | import org.biojavax.bio.seq.io.EMBLFormat; |
---|
13 | import org.biojavax.bio.seq.io.EMBLxmlFormat; |
---|
14 | import org.biojavax.bio.seq.io.FastaFormat; |
---|
15 | import org.biojavax.bio.seq.io.GenbankFormat; |
---|
16 | import org.biojavax.bio.seq.io.INSDseqFormat; |
---|
17 | import org.biojavax.bio.seq.io.RichSequenceBuilderFactory; |
---|
18 | import org.biojavax.bio.seq.io.RichSequenceFormat; |
---|
19 | import org.biojavax.bio.seq.io.RichStreamReader; |
---|
20 | import org.biojavax.bio.seq.io.RichStreamWriter; |
---|
21 | import org.biojavax.bio.seq.io.UniProtFormat; |
---|
22 | import org.biojavax.bio.seq.io.UniProtXMLFormat; |
---|
23 | import org.biojavax.bio.taxa.NCBITaxon; |
---|
24 | import org.biojavax.bio.taxa.io.NCBITaxonomyLoader; |
---|
25 | import org.biojavax.bio.taxa.io.SimpleNCBITaxonomyLoader; |
---|
26 | |
---|
27 | /** |
---|
28 | * This program will round trip sequence formats |
---|
29 | * @author Mark |
---|
30 | */ |
---|
31 | public class Main { |
---|
32 | |
---|
33 | /** |
---|
34 | * Attempts to find a format for a name String such as "genbank" or for a |
---|
35 | * fully qualified string like org.biojavax.bio.seq.io.UniProtFormat |
---|
36 | * @return the matching <code>RichSequenceFormat</code> |
---|
37 | * @param name the name of the format, case insensitive except for qualified class names |
---|
38 | * @throws java.lang.IllegalAccessException If java cannot reflectively access the named format. |
---|
39 | * Only applies to fully qualified class names. |
---|
40 | * @throws java.lang.ClassNotFoundException If a format can not be found for the name. |
---|
41 | * @throws java.lang.InstantiationException If the found object cannot be created (only applies |
---|
42 | * to fully qualified class names). |
---|
43 | */ |
---|
44 | public static RichSequenceFormat formatForName(String name) |
---|
45 | throws ClassNotFoundException, InstantiationException, IllegalAccessException { |
---|
46 | //determine the format to use |
---|
47 | RichSequenceFormat format; |
---|
48 | if (name.equalsIgnoreCase("fasta")) { |
---|
49 | format = new FastaFormat(); |
---|
50 | } else if (name.equalsIgnoreCase("genbank")) { |
---|
51 | format = new GenbankFormat(); |
---|
52 | } else if (name.equalsIgnoreCase("uniprot")) { |
---|
53 | format = new UniProtFormat(); |
---|
54 | } else if (name.equalsIgnoreCase("embl")) { |
---|
55 | format = new EMBLFormat(); |
---|
56 | } else if (name.equalsIgnoreCase("INSDseq")) { |
---|
57 | format = new INSDseqFormat(); |
---|
58 | } else if (name.equalsIgnoreCase("EMBLxml")) { |
---|
59 | format = new EMBLxmlFormat(); |
---|
60 | } else if (name.equalsIgnoreCase("UniprotXML")){ |
---|
61 | format = new UniProtXMLFormat(); |
---|
62 | } else { |
---|
63 | Class formatClass = Class.forName(name); |
---|
64 | format = (RichSequenceFormat) formatClass.newInstance(); |
---|
65 | } |
---|
66 | return format; |
---|
67 | } |
---|
68 | |
---|
69 | public static void loadNCBITaxon() throws IOException, ParseException{ |
---|
70 | NCBITaxonomyLoader l = new SimpleNCBITaxonomyLoader(); |
---|
71 | BufferedReader nodes = new BufferedReader(new FileReader("nodes.dmp")); |
---|
72 | BufferedReader names = new BufferedReader(new FileReader("names.dmp")); |
---|
73 | |
---|
74 | NCBITaxon t; |
---|
75 | while ((t = l.readNode(nodes)) != null) {} // read all the nodes first |
---|
76 | while ((t = l.readName(names)) != null) {} // then read all the names |
---|
77 | } |
---|
78 | |
---|
79 | /** |
---|
80 | * @param args the command line arguments |
---|
81 | * args[0] the input file name |
---|
82 | * args[1] the input format name or fully qualified classname (eg fasta, or |
---|
83 | * org.biojavax.bio.seq.io.FastaFormat) |
---|
84 | * args[2] the ouput format name (see above) |
---|
85 | * args[3] the alphabet (commonly DNA or Protein) |
---|
86 | * args[4] the namespace (something like gb) |
---|
87 | */ |
---|
88 | public static void main(String[] args) throws Exception { |
---|
89 | BufferedReader br = new BufferedReader(new FileReader(args[0])); |
---|
90 | RichSequenceFormat inFormat = formatForName(args[1]); |
---|
91 | RichSequenceFormat outFormat = formatForName(args[2]); |
---|
92 | FiniteAlphabet alpha = (FiniteAlphabet) AlphabetManager.alphabetForName(args[3]); |
---|
93 | Namespace ns = null; |
---|
94 | SymbolTokenization toke = alpha.getTokenization("default"); |
---|
95 | |
---|
96 | if(! (inFormat.getClass().equals(formatForName("fasta").getClass()) |
---|
97 | || outFormat.getClass().equals(formatForName("fasta").getClass()))){ |
---|
98 | System.out.println("Loading NCBI taxonomy"); |
---|
99 | loadNCBITaxon(); |
---|
100 | } |
---|
101 | |
---|
102 | RichStreamReader sr = new RichStreamReader( |
---|
103 | br, inFormat, toke, |
---|
104 | RichSequenceBuilderFactory.THRESHOLD, |
---|
105 | null); |
---|
106 | |
---|
107 | RichStreamWriter sw = new RichStreamWriter(System.out, outFormat); |
---|
108 | sw.writeStream(sr, ns); |
---|
109 | } |
---|
110 | } |
---|