00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00012
#include "gn/gnFASSource.h"
00013
#include "gn/gnBaseSpec.h"
00014
#include "gn/gnFilter.h"
00015
#include "gn/gnStringTools.h"
00016
#include "gn/gnSourceSpec.h"
00017
#include "gn/gnSourceHeader.h"
00018
#include "gn/gnDebug.h"
00019
00020 gnFASSource::gnFASSource()
00021 {
00022 m_openString =
"";
00023 m_pFilter =
gnFilter::fullDNASeqFilter();
00024
if(m_pFilter == NULL){
00025
DebugMsg(
"Error using static sequence filters.");
00026 }
00027 }
00028 gnFASSource::gnFASSource(
const gnFASSource& s ) :
gnFileSource(s)
00029 {
00030 vector< gnFileContig* >::const_iterator iter = s.
m_contigList.begin();
00031
for( ; iter != s.
m_contigList.end(); ++iter )
00032 {
00033
m_contigList.push_back( (*iter)->Clone() );
00034 }
00035 }
00036 gnFASSource::~gnFASSource()
00037 {
00038 m_ifstream.close();
00039 vector< gnFileContig* >::iterator iter =
m_contigList.begin();
00040
for( ; iter !=
m_contigList.end(); ++iter )
00041 {
00042
gnFileContig* fg = *iter;
00043 *iter = 0;
00044
delete fg;
00045 }
00046 }
00047 boolean gnFASSource::HasContig(
const string& nameStr )
const
00048
{
00049
for(
uint32 i = 0 ; i <=
m_contigList.size(); i++ )
00050 {
00051
if( nameStr ==
m_contigList[i]->GetName() )
00052
return true;
00053 }
00054
return false;
00055 }
00056 uint32 gnFASSource::GetContigID(
const string& name )
const
00057
{
00058
for(
uint32 i = 0 ; i <=
m_contigList.size(); i++ )
00059 {
00060
if( name ==
m_contigList[i]->GetName() )
00061
return i;
00062 }
00063
return ALL_CONTIGS;
00064 }
00065 string
gnFASSource::GetContigName(
const uint32 i )
const
00066
{
00067
if( i <
m_contigList.size() ){
00068
return m_contigList[i]->GetName();
00069 }
00070
return "";
00071 }
00072 gnSeqI gnFASSource::GetContigSeqLength(
const uint32 i )
const
00073
{
00074
if( i <
m_contigList.size() ){
00075
return m_contigList[i]->GetSeqLength();
00076 }
else if( i == ALL_CONTIGS){
00077
gnSeqI seqlen = 0;
00078
for(
uint32 j=0; j <
m_contigList.size(); j++)
00079 seqlen +=
m_contigList[j]->GetSeqLength();
00080
return seqlen;
00081 }
00082
return GNSEQI_ERROR;
00083 }
00084 gnFileContig*
gnFASSource::GetContig(
const uint32 i )
const
00085
{
00086
if( i <
m_contigList.size() ){
00087
return m_contigList[i];
00088 }
00089
return 0;
00090 }
00091
00092 boolean gnFASSource::SeqRead(
const gnSeqI start,
char* buf,
gnSeqI& bufLen,
const uint32 contigI )
00093 {
00094 m_ifstream.clear();
00095
uint32 contigIndex = contigI;
00096
uint64 startPos = 0;
00097
uint64 readableBytes = 0;
00098
if( !
SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00099 bufLen = 0;
00100
return false;
00101 }
00102
00103
if( contigI == ALL_CONTIGS ){
00104
uint32 curLen = 0;
00105
uint64 bytesRead = 0;
00106
while (curLen < bufLen){
00107
00108
if(readableBytes <= 0)
00109
if( !
SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00110 bufLen = curLen;
00111
return true;
00112 }
00113
00114
uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00115 Array<gnSeqC> array_buf( readLen );
00116
gnSeqC* tmpBuf = array_buf.data;
00117
00118
00119 m_ifstream.read(tmpBuf, readLen);
00120
uint64 gc = m_ifstream.gcount();
00121 bytesRead += gc;
00122 readableBytes -= gc;
00123
00124
00125
for(
uint32 i=0; i < gc; i++){
00126
if( m_pFilter->
IsValid(tmpBuf[i]) ){
00127 buf[curLen] = tmpBuf[i];
00128 curLen++;
00129 }
00130 }
00131
if(m_ifstream.eof()){
00132 m_ifstream.clear();
00133 bufLen = curLen;
00134
return true;
00135 }
00136 }
00137 bufLen = curLen;
00138 }
00139
else if( contigI <
m_contigList.size() )
00140 {
00141
uint32 curLen = 0;
00142
00143
gnSeqI contigSize =
m_contigList[contigI]->GetSeqLength();
00144 bufLen = bufLen < contigSize ? bufLen : contigSize;
00145
while (curLen < bufLen)
00146 {
00147
uint64 readLen = bufLen - curLen;
00148 Array<gnSeqC> array_buf( readLen );
00149
gnSeqC* tmpBuf = array_buf.data;
00150
00151
00152 m_ifstream.read(tmpBuf, readLen);
00153
uint64 gc = m_ifstream.gcount();
00154
00155
00156
for(
uint32 i=0; i < gc; i++){
00157
if( m_pFilter->
IsValid(tmpBuf[i]) ){
00158 buf[curLen] = tmpBuf[i];
00159 curLen++;
00160 }
00161 }
00162
if(m_ifstream.eof()){
00163 m_ifstream.clear();
00164 bufLen = curLen;
00165
return true;
00166 }
00167 }
00168 bufLen = curLen;
00169 }
00170
return true;
00171 }
00172
00173
00174
00175
00176
00177
00178 boolean gnFASSource::SeqSeek(
const gnSeqI start,
const uint32 contigI,
uint64& startPos,
uint64& readableBytes )
00179 {
00180
if( contigI == ALL_CONTIGS ){
00181
00182
gnSeqI curIndex = 0;
00183 vector< gnFileContig* >::iterator iter =
m_contigList.begin();
00184
for( ; iter !=
m_contigList.end(); ++iter )
00185 {
00186
uint64 len = (*iter)->GetSeqLength();
00187
if( (curIndex + len) > start )
00188
break;
00189 curIndex += len;
00190 }
00191
if( iter ==
m_contigList.end() )
00192
return false;
00193
00194
gnSeqI startIndex = start - curIndex;
00195
return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196 }
00197
else if( contigI <
m_contigList.size() )
00198 {
00199
return SeqStartPos( start, *(
m_contigList[contigI]), startPos, readableBytes );
00200 }
00201
return false;
00202 }
00203
00204
00205 boolean gnFASSource::SeqStartPos(
const gnSeqI start,
gnFileContig& contig,
uint64& startPos,
uint64& readableBytes ){
00206 readableBytes = 0;
00207
uint32 curLen = 0;
00208
00209 startPos = contig.
GetSectStartEnd(
gnContigSequence).first;
00210
00211
if(contig.
HasRepeatSeqGap())
00212
if(contig.
GetRepeatSeqGapSize().first > 0)
00213
if(contig.
GetRepeatSeqGapSize().second > 0){
00214
00215 startPos += start + (start/contig.
GetRepeatSeqGapSize().first)*contig.
GetRepeatSeqGapSize().second;
00216 readableBytes = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00217 m_ifstream.seekg( startPos, ios::beg );
00218
return true;
00219 }
00220 m_ifstream.seekg( startPos, ios::beg );
00221
if( m_ifstream.eof() ){
00222
ErrorMsg(
"ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00223
return false;
00224 }
00225
while(
true ){
00226
00227
00228
uint32 tmpbufsize = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00229
if(tmpbufsize == 0){
00230
ErrorMsg(
"ERROR in gnFASSource: stored contig size is incorrect.\n");
00231
return false;
00232 }
00233 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00234 Array<char> array_buf( tmpbufsize );
00235
char *tmpbuf = array_buf.data;
00236 m_ifstream.read( tmpbuf, tmpbufsize );
00237
if( m_ifstream.eof() ){
00238
ErrorMsg(
"ERROR in gnFASSource::Read End of file reached!\n");
00239
return false;
00240 }
00241
for(
uint32 i=0; i < tmpbufsize; ++i ){
00242
if( m_pFilter->
IsValid(tmpbuf[i]) ){
00243
if( curLen >= start ){
00244 startPos += i;
00245 m_ifstream.seekg( startPos, ios::beg );
00246 readableBytes = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00247
return true;
00248 }
00249 ++curLen;
00250 }
00251 }
00252 startPos += tmpbufsize;
00253 }
00254
return true;
00255 }
00256
00257
00258 boolean gnFASSource::Write(
gnBaseSource *source,
const string& filename){
00259 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00260
if(!m_ofstream.is_open())
00261
return false;
00262
uint32 contigCount = source->
GetContigListLength();
00263
for(
uint32 contigI = 0; contigI < contigCount; contigI++){
00264 string contigName = source->
GetContigName(contigI);
00265 m_ofstream <<
">" << contigName <<
";\n";
00266
gnSeqI seqLength = source->
GetContigSeqLength(contigI);
00267
while(seqLength > 0){
00268
gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;
00269 Array<gnSeqC> array_buf( writeLen+1 );
00270
gnSeqC *bases = array_buf.data;
00271
boolean success = source->
SeqRead(0, bases, writeLen, contigI);
00272
if(!success)
00273
return false;
00274 bases[writeLen] =
'\0';
00275 m_ofstream << bases <<
"\n";
00276 seqLength -= writeLen;
00277 }
00278 }
00279 m_ofstream.close();
00280
return true;
00281 }
00282
00283
00284 void gnFASSource::Write(
gnSequence& seq,
const string& filename,
boolean write_coords,
boolean enforce_unique_names){
00285 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00286
if(!m_ofstream.is_open())
00287
Throw_gnEx(FileNotOpened());
00288
Write(seq, m_ofstream, write_coords, enforce_unique_names);
00289 m_ofstream.close();
00290 }
00291
00292 void gnFASSource::Write(
gnSequence& seq, ostream& m_ostream,
boolean write_coords,
boolean enforce_unique_names){
00293 vector<string> contigNameList;
00294 Array<gnSeqC> array_buf( BUFFER_SIZE );
00295
gnSeqC *bases = array_buf.data;
00296
gnGenomeSpec* spec = seq.
GetSpec();
00297
00298
00299 string newline =
"\r\n";
00300
gnSeqI readOffset = 1;
00301
00302
for(
uint32 fragI = 0; fragI < seq.
contigListLength(); fragI++){
00303
00304 string contigName = seq.
contigName(fragI);
00305
00306
if(enforce_unique_names){
00307
uint32 name_count = 0;
00308
for(
uint32 i=0; i < contigNameList.size(); i++)
00309
if(contigNameList[i] == contigName)
00310 name_count++;
00311 contigNameList.push_back(contigName);
00312
if(name_count > 0)
00313 contigName +=
"_" +
uintToString(name_count);
00314 }
00315
00316
gnFragmentSpec* subSpec = spec->
GetSpec(fragI);
00317
gnBaseHeader *gpbh = subSpec->
GetHeader(0);
00318 string header =
"";
00319
if(gpbh != NULL){
00320 header = gpbh->
GetHeader();
00321
00322
uint32 newlinepos = header.find_first_of(
'\n', 0);
00323
if(newlinepos != string::npos){
00324
if(header[newlinepos-1] ==
'\r')
00325 newlinepos--;
00326 header = header.substr(0, newlinepos);
00327 }
00328 }
00329
00330
00331
gnSeqI readLength = seq.
contigLength(fragI);
00332 m_ostream <<
">" << contigName;
00333
if(write_coords)
00334 m_ostream <<
" " <<
"(" << readOffset <<
", " << readOffset + readLength - 1 <<
")";
00335 m_ostream <<
"; " << header << newline;
00336
00337
gnSeqI linePos = 0;
00338
while(readLength > 0){
00339
gnSeqI read_chunk_size = (BUFFER_SIZE /
FAS_LINE_WIDTH) *
FAS_LINE_WIDTH;
00340
gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00341
00342
if(!seq.
ToArray(bases, writeLen, readOffset))
00343
return;
00344
for(
gnSeqI curbase = 0; curbase < writeLen; curbase +=
FAS_LINE_WIDTH){
00345
gnSeqI writeout_size = writeLen - curbase <
FAS_LINE_WIDTH ? writeLen - curbase :
FAS_LINE_WIDTH;
00346 m_ostream << string(bases + curbase, writeout_size) << newline;
00347 }
00348 readLength -= writeLen;
00349 readOffset += writeLen;
00350 }
00351
if(linePos != 0)
00352 m_ostream << newline;
00353 }
00354
00355 }
00356
00357 gnGenomeSpec *
gnFASSource::GetSpec()
const{
00358
gnGenomeSpec *spec =
new gnGenomeSpec();
00359
for(
uint32 i=0; i <
m_contigList.size(); i++){
00360
00361
gnFragmentSpec *fragmentSpec =
new gnFragmentSpec();
00362
gnSourceSpec *contigSpec =
new gnSourceSpec((
gnBaseSource*)
this, i);
00363
00364 spec->
AddSpec(fragmentSpec, i);
00365 fragmentSpec->
AddSpec(contigSpec);
00366
00367 fragmentSpec->
SetName(
m_contigList[i]->GetName());
00368 fragmentSpec->
SetSourceName(m_openString);
00369 contigSpec->
SetName(
m_contigList[i]->GetName());
00370 contigSpec->
SetSourceName(m_openString);
00371
00372 pair<uint32, uint32> headsect =
m_contigList[i]->GetSectStartEnd(
gnContigHeader);
00373
if(headsect.first != headsect.second){
00374
gnSourceHeader *gpsh =
new gnSourceHeader((
gnBaseSource *)
this, string(
""), headsect.first, headsect.second - headsect.first);
00375 fragmentSpec->
AddHeader(gpsh, 0);
00376 }
00377 }
00378
return spec;
00379 }
00380
00381 gnFileContig*
gnFASSource::GetFileContig(
const uint32 contigI )
const{
00382
if(
m_contigList.size() > contigI)
00383
return m_contigList[contigI];
00384
return NULL;
00385 }
00386
00387 boolean gnFASSource::ParseStream( istream& fin )
00388 {
00389
00390
uint32 readState = 0;
00391
gnFileContig* currentContig = 0;
00392 string nameFStr;
00393
uint64 seqLength = 0, gapLength = 0;
00394
00395
uint64 streamPos = 0;
00396
uint64 bufReadLen = 0;
00397 Array<gnSeqC> array_buf( BUFFER_SIZE );
00398
char* buf = array_buf.data;
00399
boolean paren_hit =
false;
00400
uint32 repeatSeqSize = 0;
00401
boolean corrupt_msg =
false;
00402
00403
00404
DetermineNewlineType();
00405
00406
00407
while( !fin.eof() )
00408 {
00409
00410 fin.read( buf, BUFFER_SIZE);
00411 streamPos += bufReadLen;
00412 bufReadLen = fin.gcount();
00413
for(
uint32 i=0 ; i < bufReadLen ; i++ )
00414 {
00415
char ch = buf[i];
00416
switch( readState )
00417 {
00418
case 0:
00419
if( !((buf[0] ==
'>') || !m_pFilter->
IsValid(buf[0])) )
00420 {
00421
return false;
00422 }
00423 readState = 1;
00424
case 1:
00425
if( ch ==
'>' )
00426 {
00427 seqLength = 0; gapLength = 0;
00428 currentContig =
new gnFileContig();
00429 currentContig->
SetFileStart( streamPos + i );
00430 currentContig->
SetRepeatSeqGap(
true);
00431 currentContig->
SetRepeatSeqSize( repeatSeqSize );
00432 currentContig->
SetRepeatGapSize( m_newlineSize );
00433 readState = 2;
00434 paren_hit =
false;
00435 nameFStr =
"";
00436 }
00437
else
00438 ++gapLength;
00439
break;
00440
case 2:
00441
if(
isNewLine(ch) || ch ==
';')
00442 {
00443 currentContig->
SetName( nameFStr );
00444 currentContig->
SetSectStart(
gnContigHeader, streamPos + i + 1 );
00445
if( ch ==
';' )
00446 readState = 3;
00447
else{
00448 readState = 4;
00449
if(ch ==
'\r')
00450 currentContig->
SetSectStart(
gnContigHeader, streamPos + i + 2 );
00451 }
00452 }
00453
else if( ch ==
'(' ){
00454
if(
isSpace(buf[i-1])){
00455
00456 nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00457 }
00458 paren_hit =
true;
00459 }
else if((!
isSpace(ch) || nameFStr.length()) && !paren_hit )
00460 nameFStr += ch;
00461
break;
00462
case 3:
00463
if(
isNewLine(ch) )
00464 readState = 4;
00465
break;
00466
case 4:
00467
if( ch ==
'>' )
00468 {
00469 readState = 3;
00470 }
00471
else if( m_pFilter->
IsValid(ch) )
00472 {
00473 currentContig->
SetSectEnd(
gnContigHeader, streamPos + i );
00474 currentContig->
SetSectStart(
gnContigSequence, streamPos + i);
00475 seqLength = 1; gapLength = 0;
00476 readState = 5;
00477 }
00478
break;
00479
case 5:
00480
while(i < bufReadLen){
00481 ch = buf[i];
00482
if( m_pFilter->
IsValid(ch) ){
00483
if(gapLength > 0){
00484
if(seqLength != repeatSeqSize){
00485
if( !corrupt_msg ){
00486 corrupt_msg =
true;
00487
ErrorMsg(
"Sequence file appears corrupt, proceeding with caution\n" );
00488 }
00489 currentContig->
SetRepeatSeqGap(
false);
00490 }
if(gapLength != m_newlineSize){
00491
00492
if( !corrupt_msg ){
00493 corrupt_msg =
true;
00494
ErrorMsg(
"Sequence file appears corrupt, proceeding with caution\n" );
00495 }
00496 currentContig->
SetRepeatSeqGap(
false);
00497 }
00498 currentContig->
AddToSeqLength( seqLength );
00499 seqLength = 0;
00500 gapLength = 0;
00501 }
00502 seqLength++;
00503 }
else if( ch ==
'>'){
00504 currentContig->
AddToSeqLength( seqLength );
00505 currentContig->
SetSectEnd(
gnContigSequence, streamPos + i - 1);
00506 currentContig->
SetFileEnd( streamPos + i - 1 );
00507
m_contigList.push_back(currentContig);
00508 readState = 1;
00509 i--;
00510
break;
00511 }
00512
else if(
isNewLine(ch) ){
00513
if( repeatSeqSize == 0 ){
00514 repeatSeqSize = seqLength;
00515 currentContig->
SetRepeatSeqSize( repeatSeqSize );
00516 }
00517 gapLength++;
00518 }
00519
else{
00520
00521 currentContig->
SetRepeatSeqGap(
false);
00522
if( !corrupt_msg ){
00523 corrupt_msg =
true;
00524
ErrorMsg(
"Sequence file appears corrupt, proceeding with caution\n" );
00525 }
00526 }
00527 i++;
00528 }
00529
break;
00530
default:
00531
ErrorMsg(
"ERROR");
00532
return false;
00533
break;
00534 }
00535 }
00536 }
00537
00538
if( currentContig != 0 )
00539 {
00540
if( readState == 2 )
00541 {
00542 currentContig->
SetName( nameFStr );
00543 }
00544
if( (readState >= 2) && (readState < 5) )
00545 {
00546 currentContig->
SetSectEnd(
gnContigHeader, streamPos + bufReadLen );
00547 }
00548
else if( readState == 5 )
00549 {
00550 currentContig->
AddToSeqLength( seqLength );
00551 currentContig->
SetSectEnd(
gnContigSequence, streamPos + bufReadLen );
00552 }
00553 currentContig->
SetFileEnd( streamPos + bufReadLen );
00554
m_contigList.push_back(currentContig);
00555 }
00556 m_ifstream.clear();
00557
return true;
00558 }