001    package org.jaga.exampleApplications.proteinLocation;
002    
003    import java.io.*;
004    import java.util.*;
005    
006    /**
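007     * Reads protein sequences from FASTA files (one file per cellular location) and
007     * splits each file's proteins into a training file and a validation file.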
008     *
009     * <p><u>Project:</u> JAGA - Java API for Genetic Algorithms.</p>
010     *
011     * <p><u>Company:</u> University College London and JAGA.Org
012     *    (<a href="http://www.jaga.org" target="_blank">http://www.jaga.org</a>).
013     * </p>
014     *
015     * <p><u>Copyright:</u> (c) 2004 by G. Paperin.<br/>
016     *    This program is free software; you can redistribute it and/or modify
017     *    it under the terms of the GNU General Public License as published by
018     *    the Free Software Foundation, ONLY if you include a note of the original
019     *    author(s) in any redistributed/modified copy.<br/>
020     *    This program is distributed in the hope that it will be useful,
021     *    but WITHOUT ANY WARRANTY; without even the implied warranty of
022     *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
023     *    GNU General Public License for more details.<br/>
024     *    You should have received a copy of the GNU General Public License
025     *    along with this program; if not, write to the Free Software
026     *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
027     *    or see http://www.gnu.org/licenses/gpl.html</p>
028     *
029     * @author Greg Paperin (greg@jaga.org)
030     *
031     * @version JAGA public release 1.0 beta
032     */
033    
public class DataSplitter {

        // Pipeline (see exec()): load every file in sourceFiles, shuffle the proteins
        // of each file, then write the first part of each shuffled list to the matching
        // trainDestFiles entry and the remaining testSetProportion share to the matching
        // validDestFiles entry.

        /**
         * One FASTA record: the header text (without the leading '>') and the
         * concatenated sequence lines.
         */
        class Protein {
                String name = "<untitled>";
                String sequence = "";
                Protein() {}
                Protein(String n, String s) { name = n; sequence = s; }
        }

        /**
         * Fraction of each source file's proteins that is written to the validation
         * file; the resulting count is truncated towards zero.
         */
        public static final double testSetProportion = 0.1;

        /** Maximum number of sequence characters written per output line. */
        public static final int outputLineLen = 70;

        /** Directory prefix shared by all input and output file names. */
        public static final String dataDir = "D:/Courseworks/4C58/cw/data/";

        /** Input FASTA files; index i corresponds to index i of the destination arrays. */
        public static final String [] sourceFiles = new String [] {
                        dataDir + "Cyto_euk.fasta",
                        dataDir + "Extra_euk.fasta",
                        dataDir + "Nuclear.fasta",
                        dataDir + "Mito.fasta"};

        /** Output files receiving the training share of each source file. */
        public static final String [] trainDestFiles = new String [] {
                        dataDir + "Cytosol.train.dat",
                        dataDir + "Extracellular.train.dat",
                        dataDir + "Nucleus.train.dat",
                        dataDir + "Mitochondrion.train.dat"};

        /** Output files receiving the validation share of each source file. */
        public static final String [] validDestFiles = new String [] {
                        dataDir + "Cytosol.valid.dat",
                        dataDir + "Extracellular.valid.dat",
                        dataDir + "Nucleus.valid.dat",
                        dataDir + "Mitochondrion.valid.dat"};

        /** One list of Protein objects per entry of sourceFiles. */
        private ArrayList [] prots = new ArrayList [sourceFiles.length];

        /** Creates a splitter with one empty protein list per source file. */
        public DataSplitter() {
                for (int i = 0; i < prots.length; i++) {
                        prots[i] = new ArrayList();
                }
        }

        /**
         * Loads all proteins from one FASTA file and appends them to protList.
         *
         * @param fname    path of the FASTA file to read
         * @param protList list that receives the loaded Protein objects
         * @throws IOException if the file cannot be read or does not start with a header
         */
        private void loadProtein(String fname, ArrayList protList) throws IOException {
                System.out.print("Loading from " + fname + ". . . ");
                BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(fname)));
                int count;
                try {
                        count = readProteins(in, protList);
                } finally {
                        in.close();
                }
                System.out.println(". . . Done. " + count + " proteins loaded.");
        }

        /**
         * Parses FASTA records from a reader and appends them to protList.
         * Every line is trimmed; blank lines are skipped; a line starting with '>'
         * begins a new record and all following lines up to the next header are
         * concatenated into that record's sequence. Package-private so the parsing
         * can be exercised without touching the file system.
         *
         * @param in       source of FASTA text; not closed by this method
         * @param protList list that receives the parsed Protein objects
         * @return the number of proteins appended to protList
         * @throws IOException if reading fails or sequence data precedes the first header
         */
        int readProteins(BufferedReader in, ArrayList protList) throws IOException {
                int count = 0;
                String name = null;                          // null until the first header is seen
                StringBuffer residues = new StringBuffer();  // avoids repeated String concatenation

                String line = in.readLine();
                while (null != line) {
                        line = line.trim();
                        if (line.length() > 0) {
                                if ('>' == line.charAt(0)) {
                                        // A new header completes the record collected so far.
                                        if (null != name) {
                                                protList.add(new DataSplitter.Protein(name, residues.toString()));
                                                count++;
                                        }
                                        name = line.substring(1);
                                        residues.setLength(0);
                                } else if (null == name) {
                                        throw new IOException(
                                                        "FASTA data does not start with a '>' header line: " + line);
                                } else {
                                        residues.append(line);
                                }
                        }
                        line = in.readLine();
                }

                // End of input completes the last record, if there was one.
                if (null != name) {
                        protList.add(new DataSplitter.Protein(name, residues.toString()));
                        count++;
                }
                return count;
        }

        /**
         * Loads every file in sourceFiles into its protein list. Lists are cleared
         * first so that running exec() more than once does not duplicate proteins.
         *
         * @throws IOException if any source file cannot be loaded
         */
        private void loadProteins() throws IOException {
                for (int i = 0; i < sourceFiles.length; i++) {
                        prots[i].clear();
                        loadProtein(sourceFiles[i], prots[i]);
                }
        }

        /**
         * Shuffles each protein list so that the validation share taken by
         * saveProteins() is a random sample instead of the tail of the input file.
         */
        private void randomise() {
                for (int i = 0; i < prots.length; i++) {
                        Collections.shuffle(prots[i]);
                }
        }

        /**
         * Writes the given proteins to a file in FASTA layout.
         *
         * @param protList proteins to write
         * @param fname    path of the file to create or overwrite
         * @throws IOException if the file cannot be opened or a write error occurred
         */
        private void saveProtein(DataSplitter.Protein [] protList, String fname) throws IOException {
                System.out.print("Saving " + protList.length + " proteins to " + fname + ". . . ");
                PrintWriter out = new PrintWriter(new FileOutputStream(fname));
                try {
                        writeProteins(protList, out);
                } finally {
                        out.close();
                }
                // PrintWriter never throws on write/flush/close failures; it only records them.
                if (out.checkError()) {
                        throw new IOException("Error while writing " + fname);
                }
                System.out.println(". . . Done.");
        }

        /**
         * Prints each protein as a '>' header line followed by its sequence wrapped at
         * outputLineLen characters per line. A protein with an empty sequence produces
         * only the header line. Package-private so the formatting can be exercised
         * without touching the file system.
         *
         * @param protList proteins to print
         * @param out      destination; neither flushed nor closed by this method
         */
        void writeProteins(DataSplitter.Protein [] protList, PrintWriter out) {
                for (int i = 0; i < protList.length; i++) {
                        String seq = protList[i].sequence;
                        out.println(">" + protList[i].name);
                        for (int pos = 0; pos < seq.length(); pos += outputLineLen) {
                                int end = Math.min(pos + outputLineLen, seq.length());
                                out.println(seq.substring(pos, end));
                        }
                }
        }

        /**
         * Splits every protein list into a leading training part and a trailing
         * validation part of (int) (testSetProportion * size) proteins and writes
         * both parts to the corresponding destination files.
         *
         * @throws IOException if any destination file cannot be written
         */
        private void saveProteins() throws IOException {
                for (int i = 0; i < prots.length; i++) {
                        int total = prots[i].size();
                        int validLen = (int) (testSetProportion * (double) total);
                        int trainLen = total - validLen;

                        DataSplitter.Protein [] train = (DataSplitter.Protein [])
                                        prots[i].subList(0, trainLen).toArray(new DataSplitter.Protein[trainLen]);
                        DataSplitter.Protein [] valid = (DataSplitter.Protein [])
                                        prots[i].subList(trainLen, total).toArray(new DataSplitter.Protein[validLen]);

                        saveProtein(train, trainDestFiles[i]);
                        saveProtein(valid, validDestFiles[i]);
                }
        }

        /**
         * Runs the complete load / shuffle / save sequence. An IOException stops the
         * run and its stack trace is printed; it is not propagated to the caller.
         */
        public void exec() {
                try {
                        loadProteins();
                        randomise();
                        saveProteins();
                } catch (IOException e) {
                        e.printStackTrace();
                }
        }

        /**
         * Command-line entry point; arguments are ignored.
         *
         * @param args unused
         */
        public static void main(String[] args) {
                DataSplitter dataSplitter = new DataSplitter();
                dataSplitter.exec();
        }

}
162    }