src/gnSEQSource.cpp

Go to the documentation of this file.
00001 00002 // File: gnSEQSource.h 00003 // Purpose: Implements gnBaseSource for .SEQ files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.5.1 00007 // Author: Aaron Darling 00008 // Modified by: 00009 // Copyright: (c) Aaron Darling 00010 // Licenses: See COPYING file for details 00012 #include "gn/gnFilter.h" 00013 #include "gn/gnFeature.h" 00014 #include "gn/gnSEQSource.h" 00015 #include "gn/gnGBKSource.h" 00016 #include "gn/gnSourceSpec.h" 00017 #include "gn/gnSourceHeader.h" 00018 #include "gn/gnSourceQualifier.h" 00019 #include "gn/gnLocation.h" 00020 #include "gn/gnStringTools.h" 00021 #include "gn/gnDebug.h" 00022 00023 gnSEQSource::gnSEQSource() 00024 { 00025 m_openString = ""; 00026 m_pFilter = gnFilter::fullDNASeqFilter(); 00027 if(m_pFilter == NULL){ 00028 DebugMsg("Error using static sequence filters.\n"); 00029 } 00030 } 00031 gnSEQSource::gnSEQSource( const gnSEQSource& s ) : gnFileSource(s) 00032 { 00033 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin(); 00034 for( ; iter != s.m_contigList.end(); ++iter ) 00035 { 00036 m_contigList.push_back( (*iter)->Clone() ); 00037 } 00038 } 00039 gnSEQSource::~gnSEQSource() 00040 { 00041 m_ifstream.close(); 00042 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00043 for( ; iter != m_contigList.end(); ++iter ) 00044 { 00045 gnFileContig* fg = *iter; 00046 *iter = 0; 00047 delete fg; 00048 } 00049 } 00050 boolean gnSEQSource::HasContig( const string& name ) const 00051 { 00052 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00053 { 00054 if( name == m_contigList[i]->GetName() ) 00055 return true; 00056 } 00057 return false; 00058 } 00059 uint32 gnSEQSource::GetContigID( const string& name ) const 00060 { 00061 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00062 { 00063 if( name == m_contigList[i]->GetName() ) 00064 return i; 00065 } 00066 return ALL_CONTIGS; 00067 } 00068 string gnSEQSource::GetContigName( const uint32 i ) const 00069 { 00070 if( i < m_contigList.size() ) 00071 { 00072 return m_contigList[i]->GetName(); 00073 } 00074 return ""; 00075 } 00076 gnSeqI gnSEQSource::GetContigSeqLength( const uint32 i ) const 00077 { 00078 if( i == ALL_CONTIGS) 00079 return m_spec->GetLength(); 00080 if( i < m_contigList.size() ) 00081 { 00082 return m_contigList[i]->GetSeqLength(); 00083 } 00084 return GNSEQI_ERROR; 00085 } 00086 00087 boolean gnSEQSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){ 00088 uint64 startPos = 0; 00089 uint64 readableBytes = 0; 00090 if( !SeqSeek( start, contigI, startPos, readableBytes ) ) 00091 { 00092 bufLen = 0; 00093 return false; 00094 } 00095 00096 if( contigI == ALL_CONTIGS ) 00097 { 00098 uint32 curLen = 0; 00099 uint64 bytesRead = 0; 00100 while (curLen < bufLen) 00101 { 00102 //SeqSeek to start, Figure out how much can be read before SeqSeeking again. 00103 if(readableBytes <= 0) //Look out for zero length contigs! IMPLEMENT ME 00104 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){ 00105 bufLen = curLen; 00106 return true; 00107 } 00108 //readLen is the amount to read on this pass 00109 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 00110 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00111 00112 // read chars and filter 00113 m_ifstream.read(tmpBuf, readLen); 00114 uint64 gc = m_ifstream.gcount(); 00115 bytesRead += gc; 00116 readableBytes -= gc; 00117 for(uint32 i=0; i < gc; i++){ 00118 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00119 buf[curLen] = tmpBuf[i]; 00120 curLen++; 00121 } 00122 } 00123 delete[] tmpBuf; 00124 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00125 m_ifstream.clear(); 00126 bufLen = curLen; 00127 return true; 00128 } 00129 } 00130 bufLen = curLen; 00131 } 00132 else if( contigI < m_contigList.size() ) 00133 { 00134 uint32 curLen = 0; 00135 //check to see if the buffer is bigger than the contig. if so truncate it. 00136 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength(); 00137 bufLen = bufLen < contigSize ? bufLen : contigSize; 00138 while (curLen < bufLen) 00139 { 00140 uint64 readLen = bufLen - curLen; //the amount to read on this pass 00141 gnSeqC* tmpBuf = new gnSeqC[readLen]; //read into tmpBuf, then filter tmpBuf into curBuf 00142 00143 // read chars and filter 00144 m_ifstream.read(tmpBuf, readLen); 00145 uint64 gc = m_ifstream.gcount(); 00146 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00147 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00148 for(uint32 i=0; i < gc; i++){ 00149 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00150 buf[curLen] = tmpBuf[i]; 00151 curLen++; 00152 } 00153 } 00154 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00155 m_ifstream.clear(); 00156 bufLen = curLen; 00157 return true; 00158 } 00159 delete[] tmpBuf; 00160 } 00161 bufLen = curLen; 00162 } 00163 return true; 00164 00165 } 00166 // private: 00167 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig 00168 // returns startPos, the file offset where the sequence starts 00169 // returns true if successful, false otherwise 00170 boolean gnSEQSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes ) 00171 { 00172 if( contigI == ALL_CONTIGS ) 00173 { 00174 // find first contig 00175 gnSeqI curIndex = 0; 00176 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00177 for( ; iter != m_contigList.end(); ++iter ) 00178 { 00179 uint64 len = (*iter)->GetSeqLength(); 00180 if( (curIndex + len) > start ) 00181 break; 00182 curIndex += len; 00183 } 00184 if( iter == m_contigList.end() ) 00185 return false; 00186 // seek to start 00187 gnSeqI startIndex = start - curIndex; //startIndex is starting pos. within the contig 00188 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes ); 00189 } 00190 else if( contigI < m_contigList.size() ) 00191 { 00192 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes ); 00193 } 00194 return false; 00195 } 00196 //Returns startPos, the file offset where the sequence starts. 00197 boolean gnSEQSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ) 00198 { 00199 readableBytes = 0; 00200 uint32 curLen = 0; 00201 //seek to the file offset where the contig starts 00202 startPos = contig.GetSectStartEnd(gnContigSequence).first; //set startPos to start where the contig starts 00203 m_ifstream.seekg( startPos, ios::beg ); 00204 if( m_ifstream.eof() ){ 00205 DebugMsg("ERROR in gnSEQSource::Incorrect contig start position, End of file reached!\n"); 00206 return false; 00207 } 00208 while( true ) 00209 { 00210 // READ the rest of the contig skipping over invalid characters until we get to the starting base pair. 00211 // startPos will contain the file offset with the starting base pair 00212 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00213 if(tmpbufsize == 0){ 00214 DebugMsg("ERROR in gnSEQSource: stored contig size is incorrect."); 00215 return false; 00216 } 00217 uint64 startOffset = start; 00218 if(contig.HasRepeatSeqGap()){ 00219 if(contig.GetRepeatSeqGapSize().first > 0){ 00220 if(contig.GetRepeatSeqGapSize().second > 0){ 00221 startOffset += (start*contig.GetRepeatSeqGapSize().second)/contig.GetRepeatSeqGapSize().first; 00222 startPos+=startOffset; 00223 m_ifstream.seekg(startPos , ios::beg); 00224 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00225 return true; 00226 } 00227 }else{ 00228 startPos+=start; 00229 m_ifstream.seekg(startPos , ios::beg); 00230 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00231 return true; 00232 } 00233 } 00234 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE; //read in the smaller of the two. 00235 char *tmpbuf = new char[tmpbufsize]; 00236 m_ifstream.read( tmpbuf, tmpbufsize ); 00237 if( m_ifstream.eof() ){ 00238 ErrorMsg("ERROR in gnSEQSource::Read End of file reached!\n"); 00239 delete[] tmpbuf; 00240 return false; 00241 } 00242 for( uint32 i=0; i < tmpbufsize; ++i ){ 00243 if( m_pFilter->IsValid(tmpbuf[i]) ){ 00244 if( curLen >= start ){ //stop when we reach the starting offset within this contig 00245 startPos += i; 00246 m_ifstream.seekg( startPos, ios::beg ); //seek to startPos 00247 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00248 delete[] tmpbuf; 00249 return true; 00250 } 00251 ++curLen; //each time we read a valid b.p., increment the sequence length 00252 } 00253 } 00254 startPos += tmpbufsize; 00255 delete[] tmpbuf; 00256 } 00257 return true; 00258 } 00259 00260 00261 //IMPLEMENT ME! move these static methods somewhere else! especially basecount! 00262 void gnSEQSource::BaseCount(const string& bases, gnSeqI& a_count, gnSeqI& c_count, gnSeqI& g_count, gnSeqI& t_count, gnSeqI& other_count){ 00263 a_count = 0; 00264 c_count = 0; 00265 g_count = 0; 00266 t_count = 0; 00267 other_count = 0; 00268 for(uint32 i=0; i < bases.length(); i++){ 00269 if((bases[i] == 'a')||(bases[i] == 'A')) 00270 a_count++; 00271 else if((bases[i] == 'c')||(bases[i] == 'C')) 00272 c_count++; 00273 else if((bases[i] == 'g')||(bases[i] == 'G')) 00274 g_count++; 00275 else if((bases[i] == 't')||(bases[i] == 'T')) 00276 t_count++; 00277 else 00278 other_count++; 00279 } 00280 } 00281 00282 void gnSEQSource::FormatString(string& data, uint32 offset, uint32 width){ 00283 //first remove newlines and corresponding whitespace 00284 string::size_type newline_loc = data.find_first_of('\n', 0); 00285 while(newline_loc != string::npos){ 00286 if(data[newline_loc-1] == '\r') 00287 newline_loc--; 00288 string::size_type text_loc = newline_loc; 00289 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){ 00290 text_loc++; 00291 if(text_loc+1 == data.length()) 00292 break; 00293 } 00294 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc)); 00295 newline_loc = data.find_first_of('\n', 0); 00296 } 00297 //now reformat with newlines and whitespace, observing word boundaries... 00298 string output_string = ""; 00299 for(uint32 charI = 0; charI < data.length();){ 00300 //get the substring to append and increment charI 00301 string::size_type base_loc = charI; 00302 string append_string; 00303 while(base_loc - charI <= width){ 00304 string::size_type space_loc = data.find_first_of(' ', base_loc+1); 00305 if(space_loc - charI < width) 00306 base_loc = space_loc; 00307 else if(base_loc == charI){ 00308 //word is too big for one line. split it. 00309 append_string = data.substr(charI, width); 00310 charI+=width; 00311 }else{ 00312 append_string = data.substr(charI, base_loc - charI); 00313 charI = base_loc; 00314 } 00315 } 00316 output_string += string(offset, ' ') + append_string; 00317 if(charI + width < data.length()) 00318 output_string += "\r\n"; 00319 } 00320 data = output_string; 00321 } 00322 00323 boolean gnSEQSource::Write(gnGenomeSpec *spec, const string& filename){ 00324 ErrorMsg("Writing DNAStar SEQ files is not supported at this time. Try again next week.\n"); 00325 return false; 00326 } 00327 00328 gnFileContig* gnSEQSource::GetFileContig( const uint32 contigI ) const{ 00329 if(m_contigList.size() > contigI) 00330 return m_contigList[contigI]; 00331 return NULL; 00332 } 00333 00334 //File parsing access routine 00335 boolean gnSEQSource::ParseStream( istream& fin ) 00336 { 00337 // INIT temp varables 00338 uint32 readState = 0; 00339 uint32 lineStart = 0; 00340 int64 gapstart = -1; 00341 // INIT buffer 00342 uint32 sectionStart = 0; 00343 uint64 streamPos = 0; 00344 uint64 bufReadLen = 0; 00345 uint64 remainingBuffer = 0; 00346 char* buf = new char[BUFFER_SIZE]; 00347 gnFragmentSpec* curFrag = 0; 00348 gnSourceSpec* curSpec = 0; 00349 gnSourceHeader *curHeader; 00350 gnBaseFeature* curFeature; 00351 gnFileContig* curContig = 0; 00352 gnLocation::gnLocationType curBaseLocationType; 00353 gnSeqI curLocationStart; 00354 int32 curStartLength = 0; 00355 int32 curEndLength = 0; 00356 string curLocContig = ""; 00357 string curQualifierName; 00358 uint64 curQualifierStart; 00359 string curContigName = ""; 00360 gnSeqI seqLength = 0; 00361 gnSeqI lineSeqSize = 0; 00362 00363 m_spec = new gnGenomeSpec(); 00364 while( !fin.eof() ) 00365 { 00366 if(sectionStart > 0){ 00367 if(readState == 15) 00368 sectionStart = bufReadLen; 00369 else if(readState == 16) 00370 sectionStart = lineStart; 00371 remainingBuffer = bufReadLen - sectionStart; 00372 memmove(buf, buf+sectionStart, remainingBuffer); 00373 } 00374 // read chars 00375 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer); 00376 streamPos -= remainingBuffer; 00377 lineStart -= sectionStart; 00378 if(gapstart > 0) 00379 gapstart -= sectionStart; 00380 sectionStart = 0; 00381 bufReadLen = fin.gcount(); 00382 bufReadLen += remainingBuffer; 00383 00384 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ ) 00385 { 00386 char ch = buf[i]; 00387 switch( readState ) 00388 { 00389 case 0: //Assume we are in header at the start of a new line. 00390 //Look for keywords starting in column 1 00391 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){ //not equal to space or tab 00392 if(curSpec == NULL){ 00393 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength()); 00394 curFrag = new gnFragmentSpec(); 00395 curFrag->AddSpec(curSpec); 00396 curSpec->SetSourceName(m_openString); 00397 m_spec->AddSpec(curFrag); 00398 } 00399 if(lineStart != sectionStart){ //Add the previous header to our list 00400 uint32 j = SEQ_HEADER_NAME_LENGTH-1; 00401 for(; j > 0; j--) 00402 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' ')) 00403 break; 00404 string header_name = string(buf+sectionStart, j+1); 00405 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart); 00406 //if this is header info _before_ a locus statement then its a general file header. 00407 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0) 00408 m_spec->AddHeader(curHeader); 00409 else //otherwise its a fragment header. 00410 curFrag->AddHeader(curHeader); 00411 sectionStart = lineStart; 00412 } 00413 00414 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){ 00415 sectionStart = i + 1; 00416 readState = 1; //read in features 00417 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){ 00418 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1); 00419 curFrag->AddHeader(curHeader); 00420 curContig = new gnFileContig(); 00421 curContig->SetName(curContigName); 00422 curContigName = ""; 00423 readState = 13; //read in base pairs 00424 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){ 00425 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0) 00426 curFrag->SetCircular(true); 00427 uint32 j = SEQ_LOCUS_NAME_LENGTH + 1; 00428 for(; j > 0; j--) 00429 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')) 00430 break; 00431 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1); 00432 curFrag->SetName(curContigName); 00433 }else if(strncmp(&buf[lineStart], "^^", 2) == 0){ 00434 //start the sequence. 00435 if(curContig == NULL){ 00436 curContig = new gnFileContig(); 00437 curContig->SetName(curContigName); 00438 curContigName = ""; 00439 } 00440 i--; 00441 readState = 14; 00442 break; 00443 } 00444 } 00445 if(ch == '\n') 00446 lineStart = i + 1; 00447 break; 00448 case 1: //look for feature tag in column six. ignore whitespace before feature. 00449 if((ch == ' ')||(ch == ' ')){ 00450 break; 00451 }else if(ch == '\n'){ 00452 lineStart = i + 1; 00453 sectionStart = i + 1; 00454 break; 00455 }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead 00456 i--; 00457 readState = 0; //Deal with a Header TAG 00458 sectionStart = i + 1; 00459 break; 00460 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){ 00461 sectionStart = i; 00462 readState = 2; 00463 } // 00464 case 2: //Get the feature name. stop on whitespace 00465 if((ch == ' ')||(ch == ' ')){ 00466 string featureName(buf+sectionStart, i - sectionStart); 00467 curFeature = new gnFeature(featureName); 00468 curFrag->AddFeature(curFeature); 00469 sectionStart = i + 1; 00470 readState = 3; 00471 } 00472 break; 00473 case 3: //Ignore whitespace before feature location 00474 if((ch == ' ')||(ch == ' ')){ 00475 break; 00476 }else if((ch == '\r')||(ch == '\n')){ 00477 lineStart = i+1; 00478 break; 00479 } 00480 sectionStart = i; 00481 readState = 4; 00482 case 4: //Read a location start. stop on (<.:^ and whitespace 00483 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){ 00484 string starter(buf+sectionStart, i - sectionStart); 00485 if(ch == '('){ 00486 if(starter == "complement") 00487 curFeature->SetLocationType(gnLocation::LT_Complement); 00488 else if(starter == "order") 00489 curFeature->SetLocationType(gnLocation::LT_Order); 00490 else if(starter == "group") 00491 curFeature->SetLocationType(gnLocation::LT_Group); 00492 else if(starter == "one-of") 00493 curFeature->SetLocationType(gnLocation::LT_OneOf); 00494 sectionStart = i + 1; //ignore join since it is default. 00495 break; 00496 }else if(ch == ':'){ 00497 curLocContig = starter; 00498 sectionStart = i + 1; 00499 break; 00500 } 00501 curLocationStart = atoi(starter.c_str()); 00502 readState = 6; //read in end base by default. 00503 if(ch == '.'){ 00504 //go to special state to look for another one. 00505 readState = 5; 00506 sectionStart = i + 1; 00507 break; 00508 }else if(ch == '^'){ 00509 curBaseLocationType = gnLocation::LT_BetweenBases; 00510 }else if((ch == ' ')||(ch == ' ')){ 00511 //no end location go to qualifier 00512 gnLocation curLocation(curLocationStart, curLocationStart); 00513 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00514 readState = 7; 00515 } 00516 sectionStart = i + 1; 00517 00518 }else if(ch == '<'){ 00519 curStartLength = -1; 00520 sectionStart = i + 1; 00521 }else if(ch == '>'){ 00522 curStartLength = 1; 00523 sectionStart = i + 1; 00524 } 00525 break; 00526 case 5: //look for another period or location start. 00527 if(ch == '.'){ 00528 curBaseLocationType = gnLocation::LT_Standard; 00529 readState = 6; 00530 sectionStart = i + 1; 00531 break; 00532 } 00533 curBaseLocationType = gnLocation::LT_OneOf; 00534 case 6: //see if there's a second location value. stop on >, and whitespace 00535 if(ch == '>'){ 00536 curEndLength = 1; 00537 sectionStart = i + 1; 00538 }else if(ch == '<'){ 00539 curEndLength = -1; 00540 sectionStart = i + 1; 00541 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){ 00542 //read end location 00543 string ender(buf+sectionStart, i - sectionStart); 00544 gnSeqI curLocationEnd = atoi(ender.c_str()); 00545 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType); 00546 curEndLength = 0; 00547 curStartLength = 0; 00548 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00549 readState = ch == ',' ? 3 : 7; //read another loc if we need to. 00550 sectionStart = i+1; 00551 } 00552 break; 00553 case 7: //skip to start of qualifier 00554 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){ 00555 sectionStart = i; // Hit a tag. go deal with it. 00556 readState = 0; 00557 i--; 00558 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){ 00559 sectionStart = i; // Hit a feature. go deal with it. 00560 readState = 2; 00561 i--; 00562 }else if(ch == ','){ //oops! another location to read! 00563 sectionStart = i+1; 00564 readState = 3; 00565 }else if(ch == '/'){ //finally, a qualifier. 00566 sectionStart = i+1; 00567 readState = 8; 00568 }else if(ch == '\n') 00569 lineStart = i + 1; 00570 break; 00571 case 8: //get a qualifier, stop on = 00572 if(ch == '='){ 00573 curQualifierName = string(buf+sectionStart, i - sectionStart); 00574 readState = 9; 00575 sectionStart = i+1; 00576 } 00577 break; 00578 case 9: //are we getting a string? look for " or [ 00579 if(ch == '"'){ 00580 readState = 10; 00581 sectionStart = i; 00582 curQualifierStart = i + streamPos; 00583 }else if(ch == '['){ 00584 readState = 11; 00585 sectionStart = i; 00586 }else if((ch == '\r')||(ch == '\n')){ 00587 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00588 sectionStart = i+1; 00589 readState = 7; //look for another qualifier 00590 } 00591 break; 00592 case 10: //read until the end of the quotation. look out for escaped quotes 00593 if(ch == '"') 00594 readState = 11; 00595 if(ch == '\n'){ 00596 lineStart = i + 1; 00597 } 00598 break; 00599 case 11: 00600 if(ch != '"'){ 00601 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart)); 00602 sectionStart = i+1; 00603 readState = 7; //look for another qualifier. 00604 if(ch == '\n') 00605 lineStart = i + 1; 00606 }else 00607 readState = 10; //quote was escaped. look for another. 00608 break; 00609 case 12: 00610 if(ch == ']'){ 00611 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00612 sectionStart = i+1; 00613 readState = 7; //look for another qualifier. 00614 } 00615 break; 00616 case 13: //start the sequence read. 00617 if(ch == '^') //stupid blattlab file format. 00618 readState = 14; 00619 else{ 00620 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos); 00621 readState = 16; 00622 } 00623 break; 00624 case 14: //wait for newline before sequence starts. 00625 if(ch == '\n'){ 00626 curContig->SetRepeatSeqGap(true); 00627 lineStart = i + 1; 00628 sectionStart = i + 1; 00629 curContig->SetSectStart(gnContigSequence, i + 1 + streamPos); 00630 readState = 15; 00631 } 00632 break; 00633 case 15: 00634 if(m_pFilter->IsValid(ch)) 00635 seqLength++; 00636 else 00637 curContig->SetRepeatSeqGap(false); 00638 break; 00639 case 16: 00640 if((ch == '/')&&(i==lineStart)){ 00641 readState = 17; 00642 }else if(m_pFilter->IsValid(ch)){ 00643 seqLength++; 00644 lineSeqSize++; 00645 if(gapstart >= 0){ 00646 curContig->SetRepeatGapSize(i - gapstart); 00647 gapstart = -1; 00648 } 00649 }else if(ch == '\n'){ //IMPLEMENT ME! Needs consistent gap size checking 00650 if(sectionStart == lineStart){ 00651 curContig->SetRepeatSeqGap(true); 00652 curContig->SetRepeatSeqSize(seqLength); 00653 gapstart = i; 00654 for(; gapstart >= lineStart; gapstart--) 00655 if(m_pFilter->IsValid(buf[gapstart])) 00656 break; 00657 gapstart++; 00658 }else if(lineSeqSize != curContig->GetRepeatSeqGapSize().first) 00659 curContig->SetRepeatSeqGap(false); 00660 lineSeqSize = 0; 00661 lineStart = i + 1; 00662 } 00663 break; 00664 case 17: 00665 if((ch == '\n')&&(buf[lineStart+1] == '/')){ 00666 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos); 00667 curContig->SetSeqLength(seqLength); 00668 m_contigList.push_back(curContig); 00669 curContig = 0; 00670 curSpec->SetLength(seqLength); 00671 curSpec = 0; 00672 seqLength = 0; 00673 lineStart = i + 1; 00674 sectionStart = i + 1; 00675 readState = 0; 00676 } 00677 break; 00678 } 00679 } 00680 streamPos += bufReadLen; 00681 } 00682 if(curContig != 0){ 00683 curContig->SetSectEnd(gnContigSequence, streamPos - 1); 00684 curContig->SetSeqLength(seqLength); 00685 m_contigList.push_back(curContig); 00686 curSpec->SetLength(seqLength); 00687 } 00688 if(curSpec != NULL) 00689 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0) 00690 &&(curSpec->GetLength() == 0)){ 00691 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1); 00692 delete curFrag; 00693 } 00694 m_ifstream.clear(); 00695 delete[] buf; 00696 return true; 00697 }

Generated on Mon Feb 14 19:28:20 2005 for libGenome by doxygen 1.3.8