src/gnGBKSource.cpp

Go to the documentation of this file.
00001 00002 // File: gnGBKSource.h 00003 // Purpose: Implements gnBaseSource for GenBank sequences 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.5.1 00007 // Author: Aaron Darling 00008 // Modified by: 00009 // Copyright: (c) Aaron Darling 00010 // Licenses: See COPYING file for details 00012 #include "gn/gnFilter.h" 00013 #include "gn/gnFeature.h" 00014 #include "gn/gnGBKSource.h" 00015 #include "gn/gnSourceSpec.h" 00016 #include "gn/gnSourceHeader.h" 00017 #include "gn/gnSourceQualifier.h" 00018 #include "gn/gnLocation.h" 00019 #include "gn/gnStringTools.h" 00020 #include "gn/gnDebug.h" 00021 #include "gn/gnStringQualifier.h" 00022 #include <string> 00023 00024 gnGBKSource::gnGBKSource() 00025 { 00026 m_openString = ""; 00027 m_pFilter = gnFilter::proteinSeqFilter(); 00028 if(m_pFilter == NULL){ 00029 DebugMsg("Error using static sequence filters."); 00030 } 00031 } 00032 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s) 00033 { 00034 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin(); 00035 for( ; iter != s.m_contigList.end(); ++iter ) 00036 { 00037 m_contigList.push_back( (*iter)->Clone() ); 00038 } 00039 } 00040 gnGBKSource::~gnGBKSource() 00041 { 00042 m_ifstream.close(); 00043 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00044 for( ; iter != m_contigList.end(); ++iter ) 00045 { 00046 gnFileContig* fg = *iter; 00047 *iter = 0; 00048 delete fg; 00049 } 00050 } 00051 boolean gnGBKSource::HasContig( const string& name ) const 00052 { 00053 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00054 { 00055 if( name == m_contigList[i]->GetName() ) 00056 return true; 00057 } 00058 return false; 00059 } 00060 uint32 gnGBKSource::GetContigID( const string& name ) const 00061 { 00062 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00063 { 00064 if( name == m_contigList[i]->GetName() ) 00065 return i; 00066 } 00067 return ALL_CONTIGS; 00068 } 00069 string gnGBKSource::GetContigName( const uint32 i ) const 00070 { 00071 if( i < m_contigList.size() ) 00072 { 00073 return m_contigList[i]->GetName(); 00074 } 00075 return ""; 00076 } 00077 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const 00078 { 00079 if( i == ALL_CONTIGS) 00080 return m_spec->GetLength(); 00081 if( i < m_contigList.size() ) 00082 { 00083 return m_contigList[i]->GetSeqLength(); 00084 } 00085 return GNSEQI_ERROR; 00086 } 00087 00088 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){ 00089 uint64 startPos = 0; 00090 uint64 readableBytes = 0; 00091 if( !SeqSeek( start, contigI, startPos, readableBytes ) ) 00092 { 00093 bufLen = 0; 00094 return false; 00095 } 00096 00097 if( contigI == ALL_CONTIGS ) 00098 { 00099 uint32 curLen = 0; 00100 uint64 bytesRead = 0; 00101 while (curLen < bufLen) 00102 { 00103 //SeqSeek to start, Figure out how much can be read before SeqSeeking again. 00104 if(readableBytes <= 0) //Look out for zero length contigs! IMPLEMENT ME 00105 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){ 00106 bufLen = curLen; 00107 return true; 00108 } 00109 //readLen is the amount to read on this pass 00110 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 00111 Array<gnSeqC> array_buf( readLen ); 00112 gnSeqC* tmpBuf = array_buf.data; 00113 00114 // read chars and filter 00115 m_ifstream.read(tmpBuf, readLen); 00116 uint64 gc = m_ifstream.gcount(); 00117 bytesRead += gc; 00118 readableBytes -= gc; 00119 for(uint32 i=0; i < gc; i++){ 00120 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00121 buf[curLen] = tmpBuf[i]; 00122 curLen++; 00123 } 00124 } 00125 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00126 m_ifstream.clear(); 00127 bufLen = curLen; 00128 return true; 00129 } 00130 } 00131 bufLen = curLen; 00132 } 00133 else if( contigI < m_contigList.size() ) 00134 { 00135 uint32 curLen = 0; 00136 //check to see if the buffer is bigger than the contig. if so truncate it. 00137 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength(); 00138 bufLen = bufLen < contigSize ? bufLen : contigSize; 00139 while (curLen < bufLen) 00140 { 00141 uint64 readLen = bufLen - curLen; //the amount to read on this pass 00142 Array<gnSeqC> array_buf( readLen ); 00143 gnSeqC* tmpBuf = array_buf.data; 00144 00145 // read chars and filter 00146 m_ifstream.read(tmpBuf, readLen); 00147 uint64 gc = m_ifstream.gcount(); 00148 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00149 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00150 for(uint32 i=0; i < gc; i++){ 00151 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00152 buf[curLen] = tmpBuf[i]; 00153 curLen++; 00154 } 00155 } 00156 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00157 m_ifstream.clear(); 00158 bufLen = curLen; 00159 return true; 00160 } 00161 } 00162 bufLen = curLen; 00163 } 00164 return true; 00165 00166 } 00167 // private: 00168 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig 00169 // returns startPos, the file offset where the sequence starts 00170 // returns true if successful, false otherwise 00171 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes ) 00172 { 00173 if( contigI == ALL_CONTIGS ) 00174 { 00175 // find first contig 00176 gnSeqI curIndex = 0; 00177 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00178 for( ; iter != m_contigList.end(); ++iter ) 00179 { 00180 uint64 len = (*iter)->GetSeqLength(); 00181 if( (curIndex + len) > start ) 00182 break; 00183 curIndex += len; 00184 } 00185 if( iter == m_contigList.end() ) 00186 return false; 00187 // seek to start 00188 gnSeqI startIndex = start - curIndex; //startIndex is starting pos. within the contig 00189 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes ); 00190 } 00191 else if( contigI < m_contigList.size() ) 00192 { 00193 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes ); 00194 } 00195 return false; 00196 } 00197 //Returns startPos, the file offset where the sequence starts. 00198 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ) 00199 { 00200 readableBytes = 0; 00201 uint32 curLen = 0; 00202 //seek to the file offset where the contig starts 00203 startPos = contig.GetSectStartEnd(gnContigSequence).first; //set startPos to start where the contig starts 00204 m_ifstream.seekg( startPos, ios::beg ); 00205 if( m_ifstream.eof() ){ 00206 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n"); 00207 return false; 00208 } 00209 while( true ) 00210 { 00211 // READ the rest of the contig skipping over invalid characters until we get to the starting base pair. 00212 // startPos will contain the file offset with the starting base pair 00213 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00214 if(tmpbufsize == 0){ 00215 ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect."); 00216 return false; 00217 } 00218 uint64 startOffset = start; 00219 if(contig.HasRepeatSeqGap()){ //check for sequence integrity 00220 startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1; 00221 if( m_newlineSize == 2 ) // test compensation for strange bug 00222 startOffset--; 00223 startPos+=startOffset; 00224 m_ifstream.seekg(startPos , ios::beg); 00225 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00226 return true; 00227 } 00228 00229 //sequence is corrupt. read in base by base 00230 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE; //read in the smaller of the two. 00231 Array<char> array_buf( tmpbufsize ); 00232 char* tmpbuf = array_buf.data; 00233 00234 m_ifstream.read( tmpbuf, tmpbufsize ); 00235 if( m_ifstream.eof() ){ 00236 ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n"); 00237 return false; 00238 } 00239 for( uint32 i=0; i < tmpbufsize; ++i ){ 00240 if( m_pFilter->IsValid(tmpbuf[i]) ){ 00241 if( curLen >= start ){ //stop when we reach the starting offset within this contig 00242 startPos += i; 00243 m_ifstream.seekg( startPos, ios::beg ); //seek to startPos 00244 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00245 return true; 00246 } 00247 ++curLen; //each time we read a valid b.p., increment the sequence length 00248 } 00249 } 00250 startPos += tmpbufsize; 00251 } 00252 return true; 00253 } 00254 00255 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){ 00256 //first remove newlines and corresponding whitespace 00257 string::size_type newline_loc = data.find_first_of('\n', 0); 00258 while(newline_loc != string::npos){ 00259 if(data[newline_loc-1] == '\r') 00260 newline_loc--; 00261 string::size_type text_loc = newline_loc; 00262 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){ 00263 text_loc++; 00264 if(text_loc+1 == data.length()) 00265 break; 00266 } 00267 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc)); 00268 newline_loc = data.find_first_of('\n', 0); 00269 } 00270 //now reformat with newlines and whitespace, observing word boundaries... 00271 string output_string = ""; 00272 for(uint32 charI = 0; charI < data.length();){ 00273 //get the substring to append and increment charI 00274 string::size_type base_loc = charI; 00275 string append_string; 00276 while(base_loc - charI <= width){ 00277 string::size_type space_loc = data.find_first_of(' ', base_loc+1); 00278 if(space_loc - charI < width) 00279 base_loc = space_loc; 00280 else if(base_loc == charI){ 00281 //word is too big for one line. split it. 00282 append_string = data.substr(charI, width); 00283 charI+=width; 00284 }else{ 00285 append_string = data.substr(charI, base_loc - charI); 00286 charI = base_loc; 00287 } 00288 } 00289 output_string += string(offset, ' ') + append_string; 00290 if(charI + width < data.length()) 00291 output_string += "\r\n"; 00292 } 00293 data = output_string; 00294 } 00295 00296 template< class SubSpec > 00297 void WriteHeader(gnMultiSpec< SubSpec >* spec, const string& hdr, ofstream& m_ofstream) { 00298 gnBaseHeader* gpbh = NULL; 00299 uint32 header_index = 0; 00300 try{ 00301 do{ 00302 gpbh = spec->GetHeader(hdr, header_index); 00303 m_ofstream << gpbh->GetHeader(); 00304 header_index++; 00305 }while(gpbh != NULL); 00306 }catch(gnException& gne){} 00307 } 00308 00309 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){ 00310 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00311 if(!m_ofstream.is_open()) 00312 return false; 00313 00314 string newline = "\r\n"; 00315 gnGenomeSpec* spec = seq.GetSpec(); 00316 00317 // output general file header first if one exists. 00318 if(spec->GetHeaderListLength() == 1){ 00319 gnBaseHeader *gpbh = spec->GetHeader(0); 00320 string name = gpbh->GetHeaderName(); 00321 //IMPLEMENT ME! Is platform specific newline substitution necessary? 00322 if(string::npos != name.find(".SEQ")){ 00323 string header = gpbh->GetHeader(); 00324 m_ofstream << header; 00325 } 00326 } 00327 // TODO: Figure out where the buffer overflow is and reduce 00328 // this back to BUFFER_SIZE -- overflow appears not to corrupt 00329 // sequence data 00330 Array<gnSeqC> array_buf( 2 * BUFFER_SIZE ); 00331 gnSeqC *bases = array_buf.data; 00332 00333 for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){ 00334 gnFragmentSpec* subSpec = spec->GetSpec(specI); 00335 00336 //write out contig headers. start with LOCUS... 00337 m_ofstream << "LOCUS "; 00338 //write Locus Name 00339 string contigName = subSpec->GetName(); 00340 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH) 00341 contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH); 00342 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length(); 00343 m_ofstream << contigName << string(filler_size, ' '); 00344 //write Locus Length 00345 string length_string = uintToString(subSpec->GetLength()); 00346 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size(); 00347 m_ofstream << string(filler_size, ' ') << length_string << " bp "; 00348 //write dnatype 00349 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' '); 00350 uint32 head_look_i = 0; 00351 gnBaseHeader* gpbh = NULL; 00352 try{ 00353 gpbh = subSpec->GetHeader("LOCUS", head_look_i); 00354 }catch(gnException& gne){} 00355 if( gpbh != NULL ) 00356 dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH); 00357 m_ofstream << dnatype << string(2, ' '); 00358 //write circularity 00359 string circular = subSpec->IsCircular() ? string("circular ") : string(10, ' '); 00360 m_ofstream << circular; 00361 //write division code 00362 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' '); 00363 if(gpbh != NULL) 00364 division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH); 00365 m_ofstream << division; 00366 //write date -- IMPLEMENT ME! format the real date to spec! dd-mmm-yyyy 00367 string date = string(SEQ_LOCUS_DATE_LENGTH, ' '); 00368 if(gpbh != NULL) 00369 date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH); 00370 m_ofstream << string(7, ' ') << date << "\r\n"; 00371 00372 //write out the rest of the headers if they were supplied! 00373 WriteHeader(subSpec, "DEFINITION", m_ofstream); 00374 WriteHeader(subSpec, "ACCESSION", m_ofstream); 00375 WriteHeader(subSpec, "VERSION", m_ofstream); 00376 WriteHeader(subSpec, "KEYWORDS", m_ofstream); 00377 WriteHeader(subSpec, "SEGMENT", m_ofstream); 00378 WriteHeader(subSpec, "SOURCE", m_ofstream); 00379 WriteHeader(subSpec, "REFERENCE", m_ofstream); 00380 WriteHeader(subSpec, "COMMENT", m_ofstream); 00381 00382 //write out feature table! 00383 m_ofstream << "FEATURES Location/Qualifiers" << "\r\n"; 00384 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){ 00385 //write a feature tag 00386 gnBaseFeature *gpmf = subSpec->GetFeature(featureI); 00387 string featureName = gpmf->GetName(); 00388 m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName; 00389 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' '); 00390 //ready to output location b.s. 00391 uint32 location_count = gpmf->GetLocationListLength(); 00392 uint32 line_pos = SEQ_FEATURE_LOC_OFFSET; 00393 uint32 parenthesis_count = 0; 00394 if(location_count > 1){ 00395 m_ofstream << "join("; 00396 line_pos += 5; 00397 parenthesis_count++; 00398 } 00399 gnLocation::gnLocationType loc_type = gpmf->GetLocationType(); 00400 switch(loc_type){ 00401 case gnLocation::LT_Standard: 00402 break; 00403 case gnLocation::LT_Complement: 00404 m_ofstream << "complement("; 00405 line_pos += 11; 00406 parenthesis_count++; 00407 break; 00408 case gnLocation::LT_Order: 00409 m_ofstream << "order("; 00410 line_pos += 6; 00411 parenthesis_count++; 00412 break; 00413 case gnLocation::LT_Group: 00414 m_ofstream << "group("; 00415 parenthesis_count++; 00416 line_pos += 6; 00417 break; 00418 case gnLocation::LT_OneOf: 00419 m_ofstream << "one-of("; 00420 parenthesis_count++; 00421 line_pos += 7; 00422 break; 00423 default: 00424 break; 00425 } 00426 //create the location string, then see if it will fit on the line 00427 string location; 00428 for(uint32 locationI = 0; locationI < location_count; locationI++){ 00429 gnLocation gpl = gpmf->GetLocation(locationI); 00430 if(gpl.IsStartBoundLonger()) 00431 location += ">"; 00432 if(gpl.IsStartBoundShorter()) 00433 location += "<"; 00434 location += uintToString(gpl.GetStart()); 00435 gnSeqI end_loc = gpl.GetEnd(); 00436 if(end_loc != 0){ 00437 switch(gpl.GetType()){ 00438 case gnLocation::LT_BetweenBases: 00439 location += "^"; 00440 break; 00441 case gnLocation::LT_OneOf: 00442 location += "."; 00443 break; 00444 default: 00445 location += ".."; 00446 break; 00447 } 00448 if(gpl.IsEndBoundShorter()) 00449 location += "<"; 00450 if(gpl.IsEndBoundLonger()) 00451 location += ">"; 00452 location+= uintToString(end_loc); 00453 } 00454 if(locationI +1 < location_count) 00455 location += ","; 00456 else{ //append necessary parenthesis 00457 for(;parenthesis_count > 0; parenthesis_count--) 00458 location += ")"; 00459 } 00460 //put it on this line if it fits. otherwise make a new line. 00461 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){ 00462 m_ofstream << location; 00463 line_pos += location.length(); 00464 }else{ 00465 m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location; 00466 line_pos = SEQ_FEATURE_LOC_OFFSET + location.length(); 00467 } 00468 location = ""; 00469 } 00470 m_ofstream << "\r\n"; 00471 //now output qualifiers! yaay! 00472 //god damn this is a big ugly piece of code. 00473 00474 uint32 qualifier_count = gpmf->GetQualifierListLength(); 00475 for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){ 00476 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' '); 00477 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI); 00478 m_ofstream << "/" << qualifier->GetName() << "="; 00479 //IMPLEMENT ME! do a better word wrap on this bitch. 00480 string qually = string(qualifier->GetValue()); 00481 // FormatString(qually, SEQ_FEATURE_LOC_OFFSET, 80 - SEQ_FEATURE_LOC_OFFSET); 00482 // qually = qually.substr(SEQ_FEATURE_LOC_OFFSET); 00483 m_ofstream << qually << "\r\n"; 00484 } 00485 if(gpmf != NULL) 00486 delete gpmf; 00487 } 00488 00489 //get information about the sequence we're writing out. 00490 gnSeqI readOffset = seq.contigStart(specI); 00491 gnSeqI readLength = seq.contigLength(specI); 00492 00493 //finally - output base count and origin 00494 m_ofstream << "BASE COUNT "; 00495 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0; 00496 gnSeqI countLen = readLength + readOffset; 00497 for(gnSeqI countI = readOffset; countI < countLen;){ 00498 gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE; 00499 if(!seq.ToArray(bases, writeLen, countI)) 00500 return false; 00501 gnSeqI a, c, g, t, other; 00502 BaseCount(string(bases, writeLen), a, c, g, t, other); 00503 a_count += a; 00504 c_count += c; 00505 g_count += g; 00506 t_count += t; 00507 other_count += other; 00508 countI += writeLen; 00509 } 00510 m_ofstream << uintToString(a_count) << " a "; 00511 m_ofstream << uintToString(c_count) << " c "; 00512 m_ofstream << uintToString(g_count) << " g "; 00513 m_ofstream << uintToString(t_count) << " t "; 00514 m_ofstream << uintToString(other_count) << " others" << "\r\n"; 00515 00516 string origin = "ORIGIN\r\n"; 00517 head_look_i = 0; 00518 try{ 00519 gpbh = subSpec->GetHeader("ORIGIN", head_look_i); 00520 origin = gpbh->GetHeader(); 00521 m_ofstream << origin; 00522 }catch(gnException& gne){ 00523 m_ofstream << "ORIGIN" << endl; 00524 } 00525 //write out the sequence 00526 gnSeqI contig_bases = 0; 00527 while(readLength > 0){ //buffer the read/writes 00528 gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20; 00529 boolean success = seq.ToArray(bases, writeLen, readOffset); 00530 if(!success) 00531 return false; 00532 //print each 60 on their own lines... 00533 for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){ 00534 string baseIndexStr = uintToString(contig_bases + curbaseI +1); 00535 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' '); 00536 m_ofstream << baseIndexStr; 00537 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){ 00538 if(writeLen <= curbaseI + base_offset) 00539 break; 00540 int64 print_length = writeLen - (curbaseI + base_offset); 00541 print_length = print_length > 10 ? 10 : print_length; 00542 m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length); 00543 } 00544 m_ofstream << "\r\n"; 00545 } 00546 readLength -= writeLen; 00547 readOffset += writeLen; 00548 contig_bases += writeLen; 00549 } 00550 m_ofstream << "//\r\n"; 00551 } 00552 00553 m_ofstream.close(); 00554 return true; 00555 } 00556 00557 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{ 00558 if(m_contigList.size() > contigI) 00559 return m_contigList[contigI]; 00560 return NULL; 00561 } 00562 00563 //File parsing access routine 00564 boolean gnGBKSource::ParseStream( istream& fin ) 00565 { 00566 // INIT temp varables 00567 uint32 readState = 0; 00568 uint32 lineStart = 0; 00569 // INIT buffer 00570 uint32 sectionStart = 0; 00571 uint64 streamPos = 0; 00572 uint64 bufReadLen = 0; 00573 uint64 remainingBuffer = 0; 00574 Array<char> array_buf( BUFFER_SIZE ); 00575 char* buf = array_buf.data; 00576 gnFragmentSpec* curFrag = 0; 00577 gnSourceSpec* curSpec = 0; 00578 gnSourceHeader *curHeader; 00579 gnFeature* curFeature; 00580 gnFileContig* curContig = 0; 00581 gnLocation::gnLocationType curBaseLocationType; 00582 gnSeqI curLocationStart; 00583 int32 curStartLength = 0; 00584 int32 curEndLength = 0; 00585 string curLocContig = ""; 00586 string curQualifierName; 00587 uint64 curQualifierStart; 00588 string curContigName = ""; 00589 gnSeqI seqLength = 0; 00590 gnSeqI seqChunk, seqChunkCount, gapChunk; 00591 boolean corruptWarning = false; 00592 00593 //decide what type of newlines we have 00594 DetermineNewlineType(); 00595 00596 m_spec = new gnGenomeSpec(); 00597 while( !fin.eof() ) 00598 { 00599 if(sectionStart > 0){ 00600 if(readState == 14) 00601 sectionStart = lineStart; 00602 remainingBuffer = bufReadLen - sectionStart; 00603 memmove(buf, buf+sectionStart, remainingBuffer); 00604 } 00605 // read chars 00606 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer); 00607 streamPos -= remainingBuffer; 00608 lineStart -= sectionStart; 00609 sectionStart = 0; 00610 bufReadLen = fin.gcount(); 00611 bufReadLen += remainingBuffer; 00612 00613 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ ) 00614 { 00615 char ch = buf[i]; 00616 switch( readState ) 00617 { 00618 case 0: //Assume we are in header at the start of a new line. 00619 //Look for keywords starting in column 1 00620 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){ //not equal to space or tab 00621 if(curSpec == NULL){ 00622 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength()); 00623 curFrag = new gnFragmentSpec(); 00624 curFrag->AddSpec(curSpec); 00625 curSpec->SetSourceName(m_openString); 00626 m_spec->AddSpec(curFrag); 00627 } 00628 if(lineStart != sectionStart){ //Add the previous header to our list 00629 uint32 j = SEQ_HEADER_NAME_LENGTH-1; 00630 for(; j > 0; j--) 00631 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' ')) 00632 break; 00633 string header_name = string(buf+sectionStart, j+1); 00634 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart); 00635 //if this is header info _before_ a locus statement then its a general file header. 00636 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0) 00637 m_spec->AddHeader(curHeader); 00638 else //otherwise its a fragment header. 00639 curFrag->AddHeader(curHeader); 00640 sectionStart = lineStart; 00641 } 00642 00643 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){ 00644 sectionStart = i + 1; 00645 readState = 1; //read in features 00646 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){ 00647 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1); 00648 curFrag->AddHeader(curHeader); 00649 curContig = new gnFileContig(); 00650 curContig->SetName(curContigName); 00651 curContigName = ""; 00652 readState = 13; //read in base pairs 00653 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){ 00654 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0) 00655 curFrag->SetCircular(true); 00656 uint32 j = SEQ_LOCUS_NAME_LENGTH+1; 00657 for(; j > 0; j--) 00658 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')) 00659 break; 00660 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1); 00661 curFrag->SetName(curContigName); 00662 } 00663 } 00664 if(ch == '\n'){ 00665 lineStart = i + 1; 00666 } 00667 break; 00668 case 1: //look for feature tag in column six. ignore whitespace before feature. 00669 if((ch == ' ')||(ch == ' ')){ 00670 break; 00671 }else if(ch == '\n'){ 00672 lineStart = i + 1; 00673 sectionStart = i + 1; 00674 break; 00675 }else if(sectionStart == i){ //there was no whitespace, we hit a TAG instead 00676 i--; 00677 readState = 0; //Deal with a Header TAG 00678 sectionStart = i + 1; 00679 break; 00680 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){ 00681 sectionStart = i; 00682 readState = 2; 00683 } // 00684 case 2: //Get the feature name. stop on whitespace 00685 if((ch == ' ')||(ch == ' ')){ 00686 string featureName(buf+sectionStart, i - sectionStart); 00687 curFeature = new gnFeature(featureName); 00688 curFrag->AddFeature(curFeature); 00689 sectionStart = i + 1; 00690 readState = 3; 00691 } 00692 break; 00693 case 3: //Ignore whitespace before feature location 00694 if((ch == ' ')||(ch == ' ')){ 00695 break; 00696 }else if((ch == '\r')||(ch == '\n')){ 00697 lineStart = i+1; 00698 break; 00699 } 00700 sectionStart = i; 00701 readState = 4; 00702 // KNOWN BUG HERE! 00703 // if JOIN is outside a group of complemented coordinates the feature type 00704 // is incorrectly set to LT_COMPLEMENT. Instead each location's type should be set to LT_COMPLEMENT 00705 case 4: //Read a location start. stop on (<.:^ and whitespace 00706 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){ 00707 string starter(buf+sectionStart, i - sectionStart); 00708 if(ch == '('){ 00709 if(starter == "complement") 00710 curFeature->SetLocationType(gnLocation::LT_Complement); 00711 else if(starter == "order") 00712 curFeature->SetLocationType(gnLocation::LT_Order); 00713 else if(starter == "group") 00714 curFeature->SetLocationType(gnLocation::LT_Group); 00715 else if(starter == "one-of") 00716 curFeature->SetLocationType(gnLocation::LT_OneOf); 00717 sectionStart = i + 1; //ignore join since it is default. 00718 break; 00719 }else if(ch == ':'){ 00720 curLocContig = starter; 00721 sectionStart = i + 1; 00722 break; 00723 } 00724 curLocationStart = atoi(starter.c_str()); 00725 readState = 6; //read in end base by default. 00726 if(ch == '.'){ 00727 //go to special state to look for another one. 00728 readState = 5; 00729 sectionStart = i + 1; 00730 break; 00731 }else if(ch == '^'){ 00732 curBaseLocationType = gnLocation::LT_BetweenBases; 00733 }else if((ch == ' ')||(ch == ' ')){ 00734 //no end location go to qualifier 00735 gnLocation curLocation(curLocationStart, curLocationStart); 00736 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00737 readState = 7; 00738 } 00739 sectionStart = i + 1; 00740 00741 }else if(ch == '<'){ 00742 curStartLength = -1; 00743 sectionStart = i + 1; 00744 }else if(ch == '>'){ 00745 curStartLength = 1; 00746 sectionStart = i + 1; 00747 } 00748 break; 00749 case 5: //look for another period or location start. 00750 if(ch == '.'){ 00751 curBaseLocationType = gnLocation::LT_Standard; 00752 readState = 6; 00753 sectionStart = i + 1; 00754 break; 00755 } 00756 curBaseLocationType = gnLocation::LT_OneOf; 00757 case 6: //see if there's a second location value. stop on >, and whitespace 00758 if(ch == '>'){ 00759 curEndLength = 1; 00760 sectionStart = i + 1; 00761 }else if(ch == '<'){ 00762 curEndLength = -1; 00763 sectionStart = i + 1; 00764 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){ 00765 //read end location 00766 string ender(buf+sectionStart, i - sectionStart); 00767 gnSeqI curLocationEnd = atoi(ender.c_str()); 00768 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType); 00769 curEndLength = 0; 00770 curStartLength = 0; 00771 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength()); 00772 readState = ch == ',' ? 3 : 7; //read another loc if we need to. 00773 sectionStart = i+1; 00774 } 00775 break; 00776 case 7: //skip to start of qualifier 00777 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){ 00778 sectionStart = i; // Hit a tag. go deal with it. 00779 readState = 0; 00780 i--; 00781 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){ 00782 sectionStart = i; // Hit a feature. go deal with it. 00783 readState = 2; 00784 i--; 00785 }else if(ch == ','){ //oops! another location to read! 00786 sectionStart = i+1; 00787 readState = 3; 00788 }else if(ch == '/'){ //finally, a qualifier. 00789 sectionStart = i+1; 00790 readState = 8; 00791 }else if(ch == '\n') 00792 lineStart = i + 1; 00793 break; 00794 case 8: //get a qualifier, stop on = 00795 if(ch == '='){ 00796 curQualifierName = string(buf+sectionStart, i - sectionStart); 00797 readState = 9; 00798 sectionStart = i+1; 00799 }else if( ch == '\r' || ch == '\n' ){ 00800 // this is a value-less qualifier 00801 curQualifierName = string(buf+sectionStart, i - sectionStart); 00802 curFeature->AddQualifier( new gnStringQualifier( curQualifierName, "" )); 00803 readState = 7; 00804 sectionStart = i+1; 00805 } 00806 break; 00807 case 9: //are we getting a string? look for " or [ 00808 if(ch == '"'){ 00809 readState = 10; 00810 sectionStart = i; 00811 curQualifierStart = i + streamPos; 00812 }else if(ch == '['){ 00813 readState = 11; 00814 sectionStart = i; 00815 }else if((ch == '\r')||(ch == '\n')){ 00816 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00817 sectionStart = i+1; 00818 readState = 7; //look for another qualifier 00819 } 00820 break; 00821 case 10: //read until the end of the quotation. look out for escaped quotes 00822 if(ch == '"') 00823 readState = 11; 00824 if(ch == '\n'){ 00825 lineStart = i + 1; 00826 } 00827 break; 00828 case 11: 00829 if(ch != '"'){ 00830 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart)); 00831 sectionStart = i+1; 00832 readState = 7; //look for another qualifier. 00833 if(ch == '\n') 00834 lineStart = i + 1; 00835 }else 00836 readState = 10; //quote was escaped. look for another. 00837 break; 00838 case 12: 00839 if(ch == ']'){ 00840 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart)); 00841 sectionStart = i+1; 00842 readState = 7; //look for another qualifier. 00843 } 00844 break; 00845 case 13: //start the sequence read. 00846 curContig->SetSectStart(gnContigSequence, i - 1 + streamPos); 00847 curContig->SetRepeatSeqGap(true); 00848 seqChunk = 0; 00849 seqChunkCount = 0; 00850 gapChunk = m_newlineSize + 1; 00851 readState = 14; 00852 break; 00853 case 14: 00854 while(i < bufReadLen){ 00855 ch = buf[i]; 00856 if((ch == '/')&&(i==lineStart)){ 00857 readState = 15; //end of this sequence 00858 break; 00859 }else if(m_pFilter->IsValid(ch)){ 00860 if(gapChunk > 0){ 00861 if((gapChunk > 1 && seqChunkCount > 0) || 00862 (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){ 00863 if( !corruptWarning ){ 00864 ErrorMsg("File is corrupt. Proceed with caution."); 00865 corruptWarning = true; 00866 } 00867 curContig->SetRepeatSeqGap(false); 00868 } 00869 gapChunk = 0; 00870 } 00871 seqChunk++; 00872 seqLength++; 00873 }else{ 00874 gapChunk++; 00875 if(seqChunk == 10){ 00876 seqChunk = 0; 00877 seqChunkCount++; 00878 if(seqChunkCount == 6){ 00879 //got a complete line. start over 00880 seqChunkCount = 0; 00881 } 00882 } 00883 if(ch == '\n') 00884 lineStart = i + 1; 00885 } 00886 i++; 00887 } 00888 break; 00889 case 15: 00890 if((ch == '\n')&&(buf[lineStart+1] == '/')){ 00891 curContig->SetSectEnd(gnContigSequence, lineStart - m_newlineSize + streamPos); 00892 curContig->SetSeqLength(seqLength); 00893 m_contigList.push_back(curContig); 00894 curContig = 0; 00895 curSpec->SetLength(seqLength); 00896 curSpec = 0; 00897 seqLength = 0; 00898 lineStart = i + 1; 00899 sectionStart = i + 1; 00900 readState = 0; 00901 } 00902 break; 00903 } 00904 } 00905 streamPos += bufReadLen; 00906 } 00907 if(curContig != 0){ 00908 curContig->SetSectEnd(gnContigSequence, streamPos - 1); 00909 curContig->SetSeqLength(seqLength); 00910 m_contigList.push_back(curContig); 00911 curSpec->SetLength(seqLength); 00912 } 00913 if(curSpec != 0) 00914 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0) 00915 &&(curSpec->GetLength() == 0)){ 00916 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1); 00917 delete curFrag; 00918 } 00919 m_ifstream.clear(); 00920 return true; 00921 }

Generated on Mon Feb 14 19:28:20 2005 for libGenome by doxygen 1.3.8