| 1 | package roundtrip; |
|---|
| 2 | |
|---|
| 3 | import java.io.BufferedReader; |
|---|
| 4 | import java.io.FileReader; |
|---|
| 5 | import java.io.IOException; |
|---|
| 6 | import org.biojava.bio.seq.io.ParseException; |
|---|
| 7 | import org.biojava.bio.seq.io.SymbolTokenization; |
|---|
| 8 | import org.biojava.bio.symbol.AlphabetManager; |
|---|
| 9 | import org.biojava.bio.symbol.FiniteAlphabet; |
|---|
| 10 | import org.biojavax.Namespace; |
|---|
| 11 | import org.biojavax.bio.seq.RichSequence; |
|---|
| 12 | import org.biojavax.bio.seq.io.EMBLFormat; |
|---|
| 13 | import org.biojavax.bio.seq.io.EMBLxmlFormat; |
|---|
| 14 | import org.biojavax.bio.seq.io.FastaFormat; |
|---|
| 15 | import org.biojavax.bio.seq.io.GenbankFormat; |
|---|
| 16 | import org.biojavax.bio.seq.io.INSDseqFormat; |
|---|
| 17 | import org.biojavax.bio.seq.io.RichSequenceBuilderFactory; |
|---|
| 18 | import org.biojavax.bio.seq.io.RichSequenceFormat; |
|---|
| 19 | import org.biojavax.bio.seq.io.RichStreamReader; |
|---|
| 20 | import org.biojavax.bio.seq.io.RichStreamWriter; |
|---|
| 21 | import org.biojavax.bio.seq.io.UniProtFormat; |
|---|
| 22 | import org.biojavax.bio.seq.io.UniProtXMLFormat; |
|---|
| 23 | import org.biojavax.bio.taxa.NCBITaxon; |
|---|
| 24 | import org.biojavax.bio.taxa.io.NCBITaxonomyLoader; |
|---|
| 25 | import org.biojavax.bio.taxa.io.SimpleNCBITaxonomyLoader; |
|---|
| 26 | |
|---|
| 27 | /** |
|---|
| 28 | * This program will round trip sequence formats |
|---|
| 29 | * @author Mark |
|---|
| 30 | */ |
|---|
| 31 | public class Main { |
|---|
| 32 | |
|---|
| 33 | /** |
|---|
| 34 | * Attempts to find a format for a name String such as "genbank" or for a |
|---|
| 35 | * fully qualified string like org.biojavax.bio.seq.io.UniProtFormat |
|---|
| 36 | * @return the matching <code>RichSequenceFormat</code> |
|---|
| 37 | * @param name the name of the format, case insensitive except for qualified class names |
|---|
| 38 | * @throws java.lang.IllegalAccessException If java cannot reflectively access the named format. |
|---|
| 39 | * Only applies to fully qualified class names. |
|---|
| 40 | * @throws java.lang.ClassNotFoundException If a format can not be found for the name. |
|---|
| 41 | * @throws java.lang.InstantiationException If the found object cannot be created (only applies |
|---|
| 42 | * to fully qualified class names). |
|---|
| 43 | */ |
|---|
| 44 | public static RichSequenceFormat formatForName(String name) |
|---|
| 45 | throws ClassNotFoundException, InstantiationException, IllegalAccessException { |
|---|
| 46 | //determine the format to use |
|---|
| 47 | RichSequenceFormat format; |
|---|
| 48 | if (name.equalsIgnoreCase("fasta")) { |
|---|
| 49 | format = new FastaFormat(); |
|---|
| 50 | } else if (name.equalsIgnoreCase("genbank")) { |
|---|
| 51 | format = new GenbankFormat(); |
|---|
| 52 | } else if (name.equalsIgnoreCase("uniprot")) { |
|---|
| 53 | format = new UniProtFormat(); |
|---|
| 54 | } else if (name.equalsIgnoreCase("embl")) { |
|---|
| 55 | format = new EMBLFormat(); |
|---|
| 56 | } else if (name.equalsIgnoreCase("INSDseq")) { |
|---|
| 57 | format = new INSDseqFormat(); |
|---|
| 58 | } else if (name.equalsIgnoreCase("EMBLxml")) { |
|---|
| 59 | format = new EMBLxmlFormat(); |
|---|
| 60 | } else if (name.equalsIgnoreCase("UniprotXML")){ |
|---|
| 61 | format = new UniProtXMLFormat(); |
|---|
| 62 | } else { |
|---|
| 63 | Class formatClass = Class.forName(name); |
|---|
| 64 | format = (RichSequenceFormat) formatClass.newInstance(); |
|---|
| 65 | } |
|---|
| 66 | return format; |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | public static void loadNCBITaxon() throws IOException, ParseException{ |
|---|
| 70 | NCBITaxonomyLoader l = new SimpleNCBITaxonomyLoader(); |
|---|
| 71 | BufferedReader nodes = new BufferedReader(new FileReader("nodes.dmp")); |
|---|
| 72 | BufferedReader names = new BufferedReader(new FileReader("names.dmp")); |
|---|
| 73 | |
|---|
| 74 | NCBITaxon t; |
|---|
| 75 | while ((t = l.readNode(nodes)) != null) {} // read all the nodes first |
|---|
| 76 | while ((t = l.readName(names)) != null) {} // then read all the names |
|---|
| 77 | } |
|---|
| 78 | |
|---|
| 79 | /** |
|---|
| 80 | * @param args the command line arguments |
|---|
| 81 | * args[0] the input file name |
|---|
| 82 | * args[1] the input format name or fully qualified classname (eg fasta, or |
|---|
| 83 | * org.biojavax.bio.seq.io.FastaFormat) |
|---|
| 84 | * args[2] the ouput format name (see above) |
|---|
| 85 | * args[3] the alphabet (commonly DNA or Protein) |
|---|
| 86 | * args[4] the namespace (something like gb) |
|---|
| 87 | */ |
|---|
| 88 | public static void main(String[] args) throws Exception { |
|---|
| 89 | BufferedReader br = new BufferedReader(new FileReader(args[0])); |
|---|
| 90 | RichSequenceFormat inFormat = formatForName(args[1]); |
|---|
| 91 | RichSequenceFormat outFormat = formatForName(args[2]); |
|---|
| 92 | FiniteAlphabet alpha = (FiniteAlphabet) AlphabetManager.alphabetForName(args[3]); |
|---|
| 93 | Namespace ns = null; |
|---|
| 94 | SymbolTokenization toke = alpha.getTokenization("default"); |
|---|
| 95 | |
|---|
| 96 | if(! (inFormat.getClass().equals(formatForName("fasta").getClass()) |
|---|
| 97 | || outFormat.getClass().equals(formatForName("fasta").getClass()))){ |
|---|
| 98 | System.out.println("Loading NCBI taxonomy"); |
|---|
| 99 | loadNCBITaxon(); |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | RichStreamReader sr = new RichStreamReader( |
|---|
| 103 | br, inFormat, toke, |
|---|
| 104 | RichSequenceBuilderFactory.THRESHOLD, |
|---|
| 105 | null); |
|---|
| 106 | |
|---|
| 107 | RichStreamWriter sw = new RichStreamWriter(System.out, outFormat); |
|---|
| 108 | sw.writeStream(sr, ns); |
|---|
| 109 | } |
|---|
| 110 | } |
|---|