src/gnDNXSource.cpp

Go to the documentation of this file.
00001 00002 // File: gnDNXSource.h 00003 // Purpose: Implements gnBaseSource for .DNX files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.5.1 00007 // Author: Aaron Darling 00008 // Modified by: 00009 // Copyright: (c) Aaron Darling 00010 // Licenses: See COPYING file for details 00012 #include "gn/gnDNXSource.h" 00013 #include "gn/gnSourceSpec.h" 00014 #include "gn/gnStringSpec.h" 00015 #include "gn/gnSourceFactory.h" 00016 #include "gn/gnFASSource.h" 00017 #include "gn/gnGBKSource.h" 00018 #include "gn/gnBaseHeader.h" 00019 #include "gn/gnFilter.h" 00020 #include "gn/gnDebug.h" 00021 #include "gn/gnStringTools.h" 00022 #include <string> 00023 00024 gnDNXSource::gnDNXSource() 00025 { 00026 m_DNXSpec = new gnGenomeSpec(); 00027 m_pFilter = gnFilter::fullDNASeqFilter(); 00028 if(m_pFilter == NULL){ 00029 DebugMsg("Error using static sequence filters."); 00030 } 00031 } 00032 00033 gnDNXSource::gnDNXSource( const gnDNXSource& s ) : gnFileSource(s) 00034 { 00035 if(s.m_DNXSpec != NULL) 00036 m_DNXSpec = s.m_DNXSpec->Clone(); 00037 } 00038 00039 gnDNXSource::~gnDNXSource() 00040 { 00041 m_ifstream.close(); 00042 delete m_DNXSpec; 00043 } 00044 boolean gnDNXSource::HasContig( const string& name ) const 00045 { 00046 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){ 00047 if(m_DNXSpec->GetSpec(contigI)->GetName() == name) 00048 return true; 00049 } 00050 return false; 00051 } 00052 uint32 gnDNXSource::GetContigID( const string& name ) const 00053 { 00054 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){ 00055 if(m_DNXSpec->GetSpec(contigI)->GetName() == name) 00056 return contigI; 00057 } 00058 return ALL_CONTIGS; 00059 } 00060 string gnDNXSource::GetContigName( const uint32 i ) const 00061 { 00062 if(i < m_DNXSpec->GetSpecListLength()){ 00063 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i); 00064 return gnbs->GetName(); 00065 } 00066 return ""; 00067 } 00068 gnSeqI gnDNXSource::GetContigSeqLength( const uint32 i ) const 00069 { 00070 if( i == ALL_CONTIGS){ 00071 return m_DNXSpec->GetLength(); 00072 }else if(i < m_DNXSpec->GetSpecListLength()){ 00073 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i); 00074 return gnbs->GetLength(); 00075 } 00076 return 0; 00077 } 00078 //read raw data from the file 00079 00080 void gnDNXSource::ValidateName(string& name){ 00081 if(name == ""){ //make a random one. 00082 name.resize(4); 00083 srand(time(NULL)); 00084 for(int i=0; i < 4; i++) 00085 name[i] = (rand() % 26) + 64; 00086 } 00087 } 00088 00089 boolean gnDNXSource::Write(gnGenomeSpec* spec, const string& filename){ 00090 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00091 gnSourceFactory* m_sSourceFactory = gnSourceFactory::GetSourceFactory(); 00092 if(!m_ofstream.is_open()) 00093 return false; 00094 for(uint32 i=0; i < spec->GetSpecListLength(); i++){ //each of these will be dnx statements 00095 gnFragmentSpec* curStatementSpec = spec->GetSpec(i); 00096 string sourceName = spec->GetSourceName(); 00097 string statementName = spec->GetName(); 00098 if(!m_sSourceFactory->HasSource(sourceName)){ 00099 ValidateName(statementName); 00100 statementName += ".seq"; 00101 m_ofstream << statementName << "="; 00102 }else 00103 m_ofstream << sourceName << "="; 00104 for(uint32 j=0; j < curStatementSpec->GetSpecListLength(); j++){ //each of these will be the files 00105 //referred to by the dnx statement 00106 gnContigSpec* curSubSpec = curStatementSpec->GetSpec(i); 00107 sourceName = curStatementSpec->GetSourceName(); 00108 string contigName = curStatementSpec->GetName(); 00109 if(!m_sSourceFactory->HasSource(sourceName)){ 00110 ValidateName(contigName); 00111 string writename = contigName+".seq"; 00112 gnSequence gns = *curSubSpec; 00113 gnGBKSource::Write(gns, writename); 00114 m_ofstream << writename; 00115 }else 00116 m_ofstream << sourceName; 00117 if( j + 1 < curStatementSpec->GetSpecListLength()) 00118 m_ofstream << "+"; 00119 } 00120 m_ofstream << ";"; 00121 gnBaseHeader *gpbh = spec->GetHeader(0); 00122 string header = ""; 00123 if(gpbh != NULL){ 00124 header = gpbh->GetHeader(); 00125 //delete everything after the first newline. 00126 uint32 newlinepos = header.find_first_of('\n', 0); 00127 if(newlinepos != string::npos) 00128 header = header.substr(0, newlinepos - 1); 00129 } 00130 m_ofstream << header << "\r\n"; 00131 } 00132 m_ofstream.close(); 00133 return true; 00134 } 00135 00136 gnFileContig* gnDNXSource::GetFileContig( const uint32 contigI ) const{ 00137 return NULL; //returning NULL 00138 } 00139 00140 //reads an inputstream and creates fills the spec vector appropriately 00141 boolean gnDNXSource::ParseStream( istream& fin ) 00142 { 00143 // INIT temp varables 00144 uint32 readState = 0; //10 - currently inside a comment 00145 uint32 sectionStart = 0; 00146 gnFragmentSpec* currentFragSpec = 0; 00147 gnBaseSource *currentSource; 00148 string currentSourceName; 00149 uint32 currentContig = ALL_CONTIGS; 00150 uint32 currentSeqStart = 0; 00151 boolean currentRevComp = false; 00152 // INIT buffer 00153 uint64 bufReadLen = 0; 00154 uint64 remainingBuffer = 0; 00155 Array<char> array_buf( BUFFER_SIZE ); 00156 char* buf = array_buf.data; 00157 string curliteral; 00158 00159 //Get the source factory and add the current dnx path to it. 00160 gnSourceFactory *sourceFactory = gnSourceFactory::GetSourceFactory(); 00161 sourceFactory->AddPath(getPathString(m_openString)); 00162 00163 while( !fin.eof() ) 00164 { 00165 if(sectionStart > 0){ 00166 remainingBuffer = bufReadLen - sectionStart; 00167 if(readState == 5){ //add literal 00168 curliteral += string(buf, sectionStart, remainingBuffer); 00169 remainingBuffer = 0; 00170 sectionStart = bufReadLen; 00171 }else 00172 memmove(buf, buf+sectionStart, remainingBuffer); 00173 } 00174 // read chars 00175 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart)); 00176 sectionStart = 0; 00177 bufReadLen = fin.gcount() + remainingBuffer; 00178 00179 for( uint32 i=0 ; i < bufReadLen ; i++ ) 00180 { 00181 char ch = buf[i]; 00182 switch( readState ) 00183 { 00184 case 0: // Get name of genome 00185 if(ch == '='){ 00186 //genome name is from sectionStart to i 00187 string contigName(buf+sectionStart, i - sectionStart); 00188 currentFragSpec = new gnFragmentSpec(); 00189 currentFragSpec->SetName(contigName); 00190 currentFragSpec->SetSourceName(m_openString); 00191 m_DNXSpec->AddSpec(currentFragSpec); 00192 sectionStart = i+1; 00193 readState = 1; 00194 } 00195 break; 00196 case 1: // Ignore whitespace before filename or literal 00197 if((ch == ' ')||(ch == ' ')) 00198 break; 00199 case 2: // Are we getting a new source file name or a literal? 00200 if(ch == '"'){ //getting a literal 00201 readState = 5; 00202 sectionStart = i+1; 00203 break; 00204 } 00205 readState = 3; 00206 sectionStart = i; 00207 case 3: // Get a new source file name 00208 //stop on >, (, +, and \n 00209 if(ch == '\n' && sectionStart == i -1){ 00210 if(buf[sectionStart]=='\r'){ 00211 sectionStart = i + 1; 00212 break; 00213 } 00214 } 00215 if((ch == '+')||(ch == '>')||(ch == '(')||(ch == '\n')||(ch == ';')){ 00216 //use the entire source file 00217 string seqfile(buf, sectionStart, i - sectionStart); 00218 currentSourceName = seqfile; 00219 currentSource = sourceFactory->AddSource(seqfile, true); 00220 if (currentSource==NULL) 00221 { 00222 return false; 00223 } 00224 if((ch == '+')||(ch == '\n')||(ch == ';')){ 00225 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource); 00226 tmp_spec->SetSourceName(seqfile); 00227 currentFragSpec->AddSpec(tmp_spec); 00228 readState = 1; 00229 if(ch == '\n'){ //reached the end of the statement. parse another. 00230 readState = 0; 00231 }else if(ch == ';'){ //hit a comment. 00232 readState = 9; 00233 } 00234 }else if(ch == '>'){ //select a contig to use 00235 readState = 4; 00236 }else if(ch == '('){ // use a specified section of the entire file 00237 readState = 6; 00238 } 00239 sectionStart = i + 1; 00240 } 00241 break; 00242 case 4: // Get a specific contig to use 00243 //stop on (, +, ;, and \n 00244 if((ch == '+')||(ch == '\n')||(ch == ';')||(ch == '(')){ 00245 //use the entire contig 00246 string contigname(buf, sectionStart, i - sectionStart); 00247 currentContig = currentSource->GetContigID(contigname); 00248 if((ch == '+')||(ch == '\n')||(ch == ';')){ 00249 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig); 00250 tmp_spec->SetSourceName(currentSourceName); 00251 currentFragSpec->AddSpec(tmp_spec); 00252 readState = 1; 00253 if(ch == '\n'){ //reached the end of the statement. parse another. 00254 readState = 0; 00255 }else if(ch == ';'){ //hit a comment. 00256 readState = 9; 00257 } 00258 }else if(ch == '('){ //use the specified section 00259 readState = 6; 00260 } 00261 sectionStart = i + 1; 00262 } 00263 break; 00264 case 5: // read in a literal 00265 // stop on " 00266 if(ch == '"'){ 00267 //now create a string spec from sectionStart to i-1 00268 string literal(buf, sectionStart, i - sectionStart); 00269 if(curliteral.length() > 0){ 00270 literal += curliteral; 00271 curliteral = ""; 00272 } 00273 gnStringSpec *gpss = new gnStringSpec(literal, currentFragSpec->GetSpecListLength()); 00274 currentFragSpec->AddSpec(gpss); 00275 } 00276 case 6: // read in a specified section 00277 //stop on , or < or > 00278 if((ch == ',') || (ch == '<') || (ch == '>')){ 00279 string seqstartstring(buf, sectionStart, i - sectionStart); 00280 if(seqstartstring == "lend"){ 00281 currentSeqStart = 0; 00282 }else if (seqstartstring == "rend"){ 00283 currentSeqStart = GNSEQI_END; 00284 }else 00285 currentSeqStart = atoi(seqstartstring.c_str()) - 1; 00286 if(ch == '<') 00287 currentRevComp = true; 00288 sectionStart = i + 1; 00289 readState = 7; 00290 } 00291 break; 00292 case 7: // read in the second half of a specified section 00293 //stop on ) 00294 if(ch == ')'){ 00295 string seqendstring(buf, sectionStart, i - sectionStart); 00296 uint32 currentSeqEnd = GNSEQI_END; 00297 if(seqendstring == "lend"){ 00298 currentSeqEnd = 0; 00299 }else if (seqendstring == "rend"){ 00300 currentSeqEnd = GNSEQI_END; 00301 }else 00302 currentSeqEnd = atoi(seqendstring.c_str()) - 1; 00303 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp); 00304 tmp_spec->SetSourceName(currentSourceName); 00305 currentFragSpec->AddSpec(tmp_spec); 00306 currentRevComp = false; //set it back to its default value. 00307 sectionStart = i + 1; 00308 readState = 8; //look for connective operator 00309 } 00310 break; 00311 case 8: //skip whitespace until a connective or terminating operator is reached. 00312 if(ch == '+'){ 00313 sectionStart = i + 1; 00314 readState = 1; //start over 00315 } 00316 if(ch == '\n'){ 00317 sectionStart = i + 1; 00318 readState = 0; 00319 } 00320 if(ch == ';'){ 00321 sectionStart = i + 1; 00322 readState = 9; 00323 } 00324 break; 00325 case 9: //skip comment until newline. 00326 if(ch == '\n'){ 00327 sectionStart = i + 1; 00328 readState = 0; 00329 } 00330 break; 00331 default: 00332 DebugMsg("ERROR in file\n"); 00333 return false; 00334 break; 00335 } 00336 }// for all buf 00337 }// while !eof 00338 // CLEAN UP 00339 return true; 00340 }

Generated on Mon Feb 14 19:28:19 2005 for libGenome by doxygen 1.3.8