001 package org.jaga.exampleApplications.proteinLocation;
002
003 import java.io.*;
004 import java.util.*;
005
006 /**
* Splits the protein sequence files of the protein-location example
* application into separate training and validation data files.
008 *
009 * <p><u>Project:</u> JAGA - Java API for Genetic Algorithms.</p>
010 *
011 * <p><u>Company:</u> University College London and JAGA.Org
012 * (<a href="http://www.jaga.org" target="_blank">http://www.jaga.org</a>).
013 * </p>
014 *
015 * <p><u>Copyright:</u> (c) 2004 by G. Paperin.<br/>
016 * This program is free software; you can redistribute it and/or modify
017 * it under the terms of the GNU General Public License as published by
018 * the Free Software Foundation, ONLY if you include a note of the original
019 * author(s) in any redistributed/modified copy.<br/>
020 * This program is distributed in the hope that it will be useful,
021 * but WITHOUT ANY WARRANTY; without even the implied warranty of
022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023 * GNU General Public License for more details.<br/>
024 * You should have received a copy of the GNU General Public License
025 * along with this program; if not, write to the Free Software
026 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027 * or see http://www.gnu.org/licenses/gpl.html</p>
028 *
029 * @author Greg Paperin (greg@jaga.org)
030 *
031 * @version JAGA public release 1.0 beta
032 */
033
public class DataSplitter {

    /*
     * Reads protein records from the files listed in sourceFiles ('>' header
     * line followed by one or more sequence lines) and writes every set back
     * out as two files: the leading part of the set to the matching entry of
     * trainDestFiles and the trailing testSetProportion of it to the matching
     * entry of validDestFiles.
     */

    /** One protein record: the header text after '>' and the joined sequence lines. */
    class Protein {
        String name = "<untitled>";
        String sequence = "";
        Protein() {}
        Protein(String n, String s) { name = n; sequence = s; }
    }

    /** Fraction of each protein set that is written to the validation file. */
    public static final double testSetProportion = 0.1;

    /** Maximum number of sequence characters written per output line. */
    public static final int outputLineLen = 70;

    /** Directory prefix shared by all input and output file names. */
    public static final String dataDir = "D:/Courseworks/4C58/cw/data/";

    /** Input files, one per protein set. */
    public static final String [] sourceFiles = new String [] {
        dataDir + "Cyto_euk.fasta",
        dataDir + "Extra_euk.fasta",
        dataDir + "Nuclear.fasta",
        dataDir + "Mito.fasta"};

    /** Training output files; entry i receives the head of the set read from sourceFiles[i]. */
    public static final String [] trainDestFiles = new String [] {
        dataDir + "Cytosol.train.dat",
        dataDir + "Extracellular.train.dat",
        dataDir + "Nucleus.train.dat",
        dataDir + "Mitochondrion.train.dat"};

    /** Validation output files; entry i receives the tail of the set read from sourceFiles[i]. */
    public static final String [] validDestFiles = new String [] {
        dataDir + "Cytosol.valid.dat",
        dataDir + "Extracellular.valid.dat",
        dataDir + "Nucleus.valid.dat",
        dataDir + "Mitochondrion.valid.dat"};

    /** One list of Protein objects per entry of sourceFiles. */
    private ArrayList [] prots = newProteinLists(sourceFiles.length);

    public DataSplitter() {}

    /** Creates an array of n empty lists, so the set count follows sourceFiles. */
    private static ArrayList [] newProteinLists(int n) {
        ArrayList [] lists = new ArrayList[n];
        for (int i = 0; i < n; i++) {
            lists[i] = new ArrayList();
        }
        return lists;
    }

    /**
     * Reads all protein records from one file and appends them to a list.
     * A record starts at a line whose first non-blank character is '>' and
     * continues until the next such line or the end of the file. Sequence
     * lines are trimmed and concatenated. Blank and whitespace-only lines
     * are skipped (they used to cause a StringIndexOutOfBoundsException).
     *
     * @param fname    path of the file to read.
     * @param protList list that receives one Protein per record.
     * @throws IOException if the file cannot be opened or read.
     */
    private void loadProtein(String fname, ArrayList protList) throws IOException {
        int count = 0;
        System.out.print("Loading from " + fname + ". . . ");
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(fname)));
        try {
            // Skip blank lines in front of the first header.
            String line = in.readLine();
            while (null != line && line.trim().length() == 0) {
                line = in.readLine();
            }

            while (null != line) {
                // 'line' is a header here; drop its leading marker character.
                DataSplitter.Protein p = new DataSplitter.Protein();
                p.name = line.trim().substring(1);

                // Collect sequence lines up to the next header or end of file.
                StringBuffer seq = new StringBuffer();
                line = in.readLine();
                while (null != line) {
                    String text = line.trim();
                    if (text.length() > 0) {
                        if ('>' == text.charAt(0)) {
                            break;
                        }
                        seq.append(text);
                    }
                    line = in.readLine();
                }
                p.sequence = seq.toString();

                protList.add(p);
                count++;
            }
        } finally {
            in.close();
        }
        System.out.println(". . . Done. " + count + " proteins loaded.");
    }

    /**
     * Loads every file in sourceFiles into the list with the same index.
     *
     * @throws IOException if any of the files cannot be read.
     */
    private void loadProteins() throws IOException {
        for (int i = 0; i < prots.length; i++) {
            loadProtein(sourceFiles[i], prots[i]);
        }
    }

    /**
     * Currently does nothing: the lists stay in file order, so the validation
     * files always receive the last records of each input file.
     */
    private void randomise() {
        // NOTE(review): presumably meant to shuffle each list before the
        // split. Confirm before implementing, because it changes which
        // proteins end up in the training and validation files.
    }

    /**
     * Writes protein records to a file: a '>' header line per protein
     * followed by its sequence, wrapped at outputLineLen characters per line.
     * A protein with an empty sequence produces only its header line.
     *
     * @param protList proteins to write, in order.
     * @param fname    path of the file to create or overwrite.
     * @throws IOException if the file cannot be opened or a write fails.
     */
    private void saveProtein(DataSplitter.Protein [] protList, String fname) throws IOException {
        System.out.print("Saving " + protList.length + " proteins to " + fname + ". . . ");
        PrintWriter out = new PrintWriter(new FileOutputStream(fname));
        try {
            for (int i = 0; i < protList.length; i++) {
                String seq = protList[i].sequence;
                out.println(">" + protList[i].name);
                for (int start = 0; start < seq.length(); start += outputLineLen) {
                    int end = Math.min(start + outputLineLen, seq.length());
                    out.println(seq.substring(start, end));
                }
            }
            // PrintWriter never throws on write errors; ask it explicitly so
            // a failed write is reported rather than followed by "Done".
            if (out.checkError()) {
                throw new IOException("Error while writing proteins to " + fname);
            }
        } finally {
            out.close();
        }
        System.out.println(". . . Done.");
    }

    /**
     * Splits every loaded set and writes both parts. The validation part is
     * the last (int) (testSetProportion * size) proteins of the list; the
     * training part is everything before it.
     *
     * @throws IOException if any output file cannot be written.
     */
    private void saveProteins() throws IOException {
        for (int i = 0; i < prots.length; i++) {

            int total = prots[i].size();
            int validLen = (int) (testSetProportion * (double) total);
            int trainLen = total - validLen;

            DataSplitter.Protein [] train = new DataSplitter.Protein[trainLen];
            for (int j = 0; j < trainLen; j++) {
                train[j] = (DataSplitter.Protein) prots[i].get(j);
            }

            DataSplitter.Protein [] valid = new DataSplitter.Protein[validLen];
            for (int j = 0; j < validLen; j++) {
                valid[j] = (DataSplitter.Protein) prots[i].get(trainLen + j);
            }

            saveProtein(train, trainDestFiles[i]);
            saveProtein(valid, validDestFiles[i]);
        }
    }

    /**
     * Runs the whole job: load, randomise, save. An IOException from any
     * step aborts the run and its stack trace is printed.
     */
    public void exec() {
        try {
            loadProteins();
            randomise();
            saveProteins();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point; the arguments are ignored.
     *
     * @param args unused.
     */
    public static void main(String[] args) {
        DataSplitter dataSplitter = new DataSplitter();
        dataSplitter.exec();
    }

}