src/gnFASSource.cpp

Go to the documentation of this file.
00001 00002 // File: gnFASSource.h 00003 // Purpose: Implements gnBaseSource for .FAS files 00004 // Description: 00005 // Changes: 00006 // Version: libGenome 0.5.1 00007 // Author: Aaron Darling 00008 // Modified by: 00009 // Copyright: (c) Aaron Darling 00010 // Licenses: See COPYING file for details 00012 #include "gn/gnFASSource.h" 00013 #include "gn/gnBaseSpec.h" 00014 #include "gn/gnFilter.h" 00015 #include "gn/gnStringTools.h" 00016 #include "gn/gnSourceSpec.h" 00017 #include "gn/gnSourceHeader.h" 00018 #include "gn/gnDebug.h" 00019 00020 gnFASSource::gnFASSource() 00021 { 00022 m_openString = ""; 00023 m_pFilter = gnFilter::fullDNASeqFilter(); 00024 if(m_pFilter == NULL){ 00025 DebugMsg("Error using static sequence filters."); 00026 } 00027 } 00028 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s) 00029 { 00030 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin(); 00031 for( ; iter != s.m_contigList.end(); ++iter ) 00032 { 00033 m_contigList.push_back( (*iter)->Clone() ); 00034 } 00035 } 00036 gnFASSource::~gnFASSource() 00037 { 00038 m_ifstream.close(); 00039 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00040 for( ; iter != m_contigList.end(); ++iter ) 00041 { 00042 gnFileContig* fg = *iter; 00043 *iter = 0; 00044 delete fg; 00045 } 00046 } 00047 boolean gnFASSource::HasContig( const string& nameStr ) const 00048 { 00049 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00050 { 00051 if( nameStr == m_contigList[i]->GetName() ) 00052 return true; 00053 } 00054 return false; 00055 } 00056 uint32 gnFASSource::GetContigID( const string& name ) const 00057 { 00058 for(uint32 i = 0 ; i <= m_contigList.size(); i++ ) 00059 { 00060 if( name == m_contigList[i]->GetName() ) 00061 return i; 00062 } 00063 return ALL_CONTIGS; 00064 } 00065 string gnFASSource::GetContigName( const uint32 i ) const 00066 { 00067 if( i < m_contigList.size() ){ 00068 return m_contigList[i]->GetName(); 00069 } 00070 return ""; 00071 } 00072 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const 00073 { 00074 if( i < m_contigList.size() ){ 00075 return m_contigList[i]->GetSeqLength(); 00076 }else if( i == ALL_CONTIGS){ 00077 gnSeqI seqlen = 0; 00078 for(uint32 j=0; j < m_contigList.size(); j++) 00079 seqlen += m_contigList[j]->GetSeqLength(); 00080 return seqlen; 00081 } 00082 return GNSEQI_ERROR; 00083 } 00084 gnFileContig* gnFASSource::GetContig( const uint32 i ) const 00085 { 00086 if( i < m_contigList.size() ){ 00087 return m_contigList[i]; 00088 } 00089 return 0; 00090 } 00091 00092 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ) 00093 { 00094 m_ifstream.clear(); 00095 uint32 contigIndex = contigI; 00096 uint64 startPos = 0; 00097 uint64 readableBytes = 0; 00098 if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){ 00099 bufLen = 0; 00100 return false; 00101 } 00102 00103 if( contigI == ALL_CONTIGS ){ 00104 uint32 curLen = 0; 00105 uint64 bytesRead = 0; 00106 while (curLen < bufLen){ 00107 //SeqSeek to start, Figure out how much can be read before SeqSeeking again. 00108 if(readableBytes <= 0) //Look out for zero length contigs! IMPLEMENT ME 00109 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){ 00110 bufLen = curLen; 00111 return true; 00112 } 00113 //readLen is the amount to read on this pass 00114 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes; 00115 Array<gnSeqC> array_buf( readLen ); 00116 gnSeqC* tmpBuf = array_buf.data; //read into tmpBuf, then filter tmpBuf into curBuf 00117 00118 // read chars and filter 00119 m_ifstream.read(tmpBuf, readLen); 00120 uint64 gc = m_ifstream.gcount(); 00121 bytesRead += gc; 00122 readableBytes -= gc; 00123 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00124 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00125 for(uint32 i=0; i < gc; i++){ 00126 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00127 buf[curLen] = tmpBuf[i]; 00128 curLen++; 00129 } 00130 } 00131 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00132 m_ifstream.clear(); 00133 bufLen = curLen; 00134 return true; 00135 } 00136 } 00137 bufLen = curLen; 00138 } 00139 else if( contigI < m_contigList.size() ) 00140 { 00141 uint32 curLen = 0; 00142 //check to see if the buffer is bigger than the contig. if so truncate it. 00143 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength(); 00144 bufLen = bufLen < contigSize ? bufLen : contigSize; 00145 while (curLen < bufLen) 00146 { 00147 uint64 readLen = bufLen - curLen; //the amount to read on this pass 00148 Array<gnSeqC> array_buf( readLen ); // use Array template to ensure proper deallocation 00149 gnSeqC* tmpBuf = array_buf.data; //read into tmpBuf, then filter tmpBuf into curBuf 00150 00151 // read chars and filter 00152 m_ifstream.read(tmpBuf, readLen); 00153 uint64 gc = m_ifstream.gcount(); 00154 // cout << "Read " << gc << " chars from " << m_openString << "\n"; 00155 // cout << "Checking character validity on: " << tmpBuf << "\n"; 00156 for(uint32 i=0; i < gc; i++){ 00157 if( m_pFilter->IsValid(tmpBuf[i]) ){ 00158 buf[curLen] = tmpBuf[i]; 00159 curLen++; 00160 } 00161 } 00162 if(m_ifstream.eof()){ //we hit the end of the file. bail out. 00163 m_ifstream.clear(); 00164 bufLen = curLen; 00165 return true; 00166 } 00167 } 00168 bufLen = curLen; 00169 } 00170 return true; 00171 } 00172 00173 00174 // private: 00175 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig 00176 // returns startPos, the file offset where the sequence starts 00177 // returns true if successful, false otherwise 00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes ) 00179 { 00180 if( contigI == ALL_CONTIGS ){ 00181 // find first contig 00182 gnSeqI curIndex = 0; 00183 vector< gnFileContig* >::iterator iter = m_contigList.begin(); 00184 for( ; iter != m_contigList.end(); ++iter ) 00185 { 00186 uint64 len = (*iter)->GetSeqLength(); 00187 if( (curIndex + len) > start ) 00188 break; 00189 curIndex += len; 00190 } 00191 if( iter == m_contigList.end() ) 00192 return false; 00193 // seek to start 00194 gnSeqI startIndex = start - curIndex; //startIndex is starting pos. within the contig 00195 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes ); 00196 } 00197 else if( contigI < m_contigList.size() ) 00198 { 00199 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes ); 00200 } 00201 return false; 00202 } 00203 00204 //Returns startPos, the file offset where the sequence starts. 00205 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){ 00206 readableBytes = 0; 00207 uint32 curLen = 0; 00208 //seek to the file offset where the contig starts 00209 startPos = contig.GetSectStartEnd(gnContigSequence).first; //set startPos to start where the contig starts 00210 00211 if(contig.HasRepeatSeqGap()) 00212 if(contig.GetRepeatSeqGapSize().first > 0) 00213 if(contig.GetRepeatSeqGapSize().second > 0){ 00214 //check this 00215 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second; 00216 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00217 m_ifstream.seekg( startPos, ios::beg ); 00218 return true; 00219 } 00220 m_ifstream.seekg( startPos, ios::beg ); 00221 if( m_ifstream.eof() ){ 00222 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n"); 00223 return false; 00224 } 00225 while( true ){ 00226 // READ the rest of the contig skipping over invalid characters until we get to the starting base pair. 00227 // startPos will contain the file offset with the starting base pair 00228 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00229 if(tmpbufsize == 0){ 00230 ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n"); 00231 return false; 00232 } 00233 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE; //read in the smaller of the two. 00234 Array<char> array_buf( tmpbufsize ); // use Array template to ensure proper deallocation 00235 char *tmpbuf = array_buf.data; 00236 m_ifstream.read( tmpbuf, tmpbufsize ); 00237 if( m_ifstream.eof() ){ 00238 ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n"); 00239 return false; 00240 } 00241 for( uint32 i=0; i < tmpbufsize; ++i ){ 00242 if( m_pFilter->IsValid(tmpbuf[i]) ){ 00243 if( curLen >= start ){ //stop when we reach the starting offset within this contig 00244 startPos += i; 00245 m_ifstream.seekg( startPos, ios::beg ); //seek to startPos 00246 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos; 00247 return true; 00248 } 00249 ++curLen; //each time we read a valid b.p., increment the sequence length 00250 } 00251 } 00252 startPos += tmpbufsize; 00253 } 00254 return true; 00255 } 00256 00257 //write out the given source in this file format. 00258 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){ 00259 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00260 if(!m_ofstream.is_open()) 00261 return false; 00262 uint32 contigCount = source->GetContigListLength(); 00263 for(uint32 contigI = 0; contigI < contigCount; contigI++){ 00264 string contigName = source->GetContigName(contigI); 00265 m_ofstream << ">" << contigName << ";\n"; 00266 gnSeqI seqLength = source->GetContigSeqLength(contigI); 00267 while(seqLength > 0){ 00268 gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE; 00269 Array<gnSeqC> array_buf( writeLen+1 ); // use Array template to ensure proper deallocation 00270 gnSeqC *bases = array_buf.data; 00271 boolean success = source->SeqRead(0, bases, writeLen, contigI); 00272 if(!success) 00273 return false; 00274 bases[writeLen] = '\0'; 00275 m_ofstream << bases << "\n"; 00276 seqLength -= writeLen; 00277 } 00278 } 00279 m_ofstream.close(); 00280 return true; 00281 } 00282 00283 //write out the given sequence in this file format. 00284 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){ 00285 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary); 00286 if(!m_ofstream.is_open()) 00287 Throw_gnEx(FileNotOpened()); 00288 Write(seq, m_ofstream, write_coords, enforce_unique_names); 00289 m_ofstream.close(); 00290 } 00291 00292 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){ 00293 vector<string> contigNameList; 00294 Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation 00295 gnSeqC *bases = array_buf.data; 00296 gnGenomeSpec* spec = seq.GetSpec(); 00297 00298 //set the newline type. 00299 string newline = "\r\n"; 00300 gnSeqI readOffset = 1; 00301 00302 for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){ 00303 //write out contig name and header 00304 string contigName = seq.contigName(fragI); 00305 00306 if(enforce_unique_names){ 00307 uint32 name_count = 0; 00308 for(uint32 i=0; i < contigNameList.size(); i++) 00309 if(contigNameList[i] == contigName) 00310 name_count++; 00311 contigNameList.push_back(contigName); 00312 if(name_count > 0) 00313 contigName += "_" + uintToString(name_count); 00314 } 00315 00316 gnFragmentSpec* subSpec = spec->GetSpec(fragI); 00317 gnBaseHeader *gpbh = subSpec->GetHeader(0); 00318 string header = ""; 00319 if(gpbh != NULL){ 00320 header = gpbh->GetHeader(); 00321 //delete everything after the first newline. 00322 uint32 newlinepos = header.find_first_of('\n', 0); 00323 if(newlinepos != string::npos){ 00324 if(header[newlinepos-1] == '\r') 00325 newlinepos--; 00326 header = header.substr(0, newlinepos); 00327 } 00328 } //IMPLEMENT ME!! Does header line need to be < 80 chars? 00329 00330 //write out the sequence 00331 gnSeqI readLength = seq.contigLength(fragI); 00332 m_ostream << ">" << contigName; 00333 if(write_coords) 00334 m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")"; 00335 m_ostream << "; " << header << newline; 00336 00337 gnSeqI linePos = 0; 00338 while(readLength > 0){ //buffer the read/writes 00339 gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH; 00340 gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size; 00341 00342 if(!seq.ToArray(bases, writeLen, readOffset)) 00343 return; 00344 for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){ 00345 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH; 00346 m_ostream << string(bases + curbase, writeout_size) << newline; 00347 } 00348 readLength -= writeLen; 00349 readOffset += writeLen; 00350 } 00351 if(linePos != 0) 00352 m_ostream << newline; 00353 } 00354 00355 } 00356 00357 gnGenomeSpec *gnFASSource::GetSpec() const{ 00358 gnGenomeSpec *spec = new gnGenomeSpec(); 00359 for(uint32 i=0; i < m_contigList.size(); i++){ 00360 //create specs for the fragment and contig levels 00361 gnFragmentSpec *fragmentSpec = new gnFragmentSpec(); 00362 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i); 00363 //set up the spec tree structure 00364 spec->AddSpec(fragmentSpec, i); 00365 fragmentSpec->AddSpec(contigSpec); 00366 00367 fragmentSpec->SetName(m_contigList[i]->GetName()); 00368 fragmentSpec->SetSourceName(m_openString); 00369 contigSpec->SetName(m_contigList[i]->GetName()); 00370 contigSpec->SetSourceName(m_openString); 00371 00372 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader); 00373 if(headsect.first != headsect.second){ 00374 gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first); 00375 fragmentSpec->AddHeader(gpsh, 0); 00376 } 00377 } 00378 return spec; 00379 } 00380 00381 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{ 00382 if(m_contigList.size() > contigI) 00383 return m_contigList[contigI]; 00384 return NULL; 00385 } 00386 00387 boolean gnFASSource::ParseStream( istream& fin ) 00388 { 00389 // INIT temp varables 00390 uint32 readState = 0; 00391 gnFileContig* currentContig = 0; 00392 string nameFStr; 00393 uint64 seqLength = 0, gapLength = 0; 00394 // INIT buffer 00395 uint64 streamPos = 0; 00396 uint64 bufReadLen = 0; 00397 Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation 00398 char* buf = array_buf.data; 00399 boolean paren_hit = false; 00400 uint32 repeatSeqSize = 0; 00401 boolean corrupt_msg = false; 00402 00403 //decide what type of newlines we have 00404 DetermineNewlineType(); 00405 00406 00407 while( !fin.eof() ) 00408 { 00409 // read chars 00410 fin.read( buf, BUFFER_SIZE); 00411 streamPos += bufReadLen; 00412 bufReadLen = fin.gcount(); 00413 for( uint32 i=0 ; i < bufReadLen ; i++ ) 00414 { 00415 char ch = buf[i]; 00416 switch( readState ) 00417 { 00418 case 0: // CHECK file validity 00419 if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) ) 00420 { 00421 return false; // NOT a .fas file 00422 } 00423 readState = 1; 00424 case 1: // BRANCH 00425 if( ch == '>' ) 00426 { 00427 seqLength = 0; gapLength = 0; // reset count 00428 currentContig = new gnFileContig(); 00429 currentContig->SetFileStart( streamPos + i ); 00430 currentContig->SetRepeatSeqGap(true); 00431 currentContig->SetRepeatSeqSize( repeatSeqSize ); 00432 currentContig->SetRepeatGapSize( m_newlineSize ); 00433 readState = 2; 00434 paren_hit = false; 00435 nameFStr = ""; 00436 } 00437 else 00438 ++gapLength; 00439 break; 00440 case 2: // > CONTIG NAME 00441 if( isNewLine(ch) || ch == ';') 00442 { 00443 currentContig->SetName( nameFStr ); 00444 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 ); 00445 if( ch == ';' ) 00446 readState = 3; 00447 else{ 00448 readState = 4; 00449 if(ch == '\r') 00450 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 ); 00451 } 00452 } 00453 else if( ch == '(' ){ 00454 if(isSpace(buf[i-1])){ 00455 //delete the last char in nameFStr 00456 nameFStr = nameFStr.substr(0, nameFStr.length()-1); 00457 } 00458 paren_hit = true; 00459 }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit ) 00460 nameFStr += ch; 00461 break; 00462 case 3: // >... ; REMARK 00463 if( isNewLine(ch) ) 00464 readState = 4; 00465 break; 00466 case 4: // >... ; /n NEWLINE BRANCH 00467 if( ch == '>' ) 00468 { 00469 readState = 3; 00470 } 00471 else if( m_pFilter->IsValid(ch) ) 00472 { 00473 currentContig->SetSectEnd( gnContigHeader, streamPos + i ); 00474 currentContig->SetSectStart( gnContigSequence, streamPos + i); 00475 seqLength = 1; gapLength = 0; 00476 readState = 5; 00477 } 00478 break; 00479 case 5: // SEQUENCE 00480 while(i < bufReadLen){ 00481 ch = buf[i]; 00482 if( m_pFilter->IsValid(ch) ){ 00483 if(gapLength > 0){ 00484 if(seqLength != repeatSeqSize){ 00485 if( !corrupt_msg ){ 00486 corrupt_msg = true; 00487 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" ); 00488 } 00489 currentContig->SetRepeatSeqGap(false); 00490 }if(gapLength != m_newlineSize){ 00491 //file is corrupt, can't do jumps. 00492 if( !corrupt_msg ){ 00493 corrupt_msg = true; 00494 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" ); 00495 } 00496 currentContig->SetRepeatSeqGap(false); 00497 } 00498 currentContig->AddToSeqLength( seqLength ); 00499 seqLength = 0; 00500 gapLength = 0; 00501 } 00502 seqLength++; 00503 }else if( ch == '>'){ 00504 currentContig->AddToSeqLength( seqLength ); 00505 currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1); 00506 currentContig->SetFileEnd( streamPos + i - 1 ); 00507 m_contigList.push_back(currentContig); 00508 readState = 1; 00509 i--; 00510 break; 00511 } 00512 else if( isNewLine(ch) ){ 00513 if( repeatSeqSize == 0 ){ 00514 repeatSeqSize = seqLength; 00515 currentContig->SetRepeatSeqSize( repeatSeqSize ); 00516 } 00517 gapLength++; 00518 } 00519 else{ 00520 // Error cannot do nice jumps 00521 currentContig->SetRepeatSeqGap(false); 00522 if( !corrupt_msg ){ 00523 corrupt_msg = true; 00524 ErrorMsg( "Sequence file appears corrupt, proceeding with caution\n" ); 00525 } 00526 } 00527 i++; 00528 } 00529 break; 00530 default: 00531 ErrorMsg("ERROR"); 00532 return false; 00533 break; 00534 } 00535 }// for all buf 00536 }// while !eof 00537 // CLEAN UP 00538 if( currentContig != 0 ) 00539 { 00540 if( readState == 2 ) 00541 { 00542 currentContig->SetName( nameFStr ); 00543 } 00544 if( (readState >= 2) && (readState < 5) ) 00545 { 00546 currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen ); 00547 } 00548 else if( readState == 5 ) 00549 { 00550 currentContig->AddToSeqLength( seqLength ); 00551 currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen ); 00552 } 00553 currentContig->SetFileEnd( streamPos + bufReadLen ); 00554 m_contigList.push_back(currentContig); 00555 } 00556 m_ifstream.clear(); 00557 return true; 00558 }

Generated on Mon Feb 14 19:28:19 2005 for libGenome by doxygen 1.3.8