00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00012
#include "gn/gnFilter.h"
00013
#include "gn/gnFeature.h"
00014
#include "gn/gnGBKSource.h"
00015
#include "gn/gnSourceSpec.h"
00016
#include "gn/gnSourceHeader.h"
00017
#include "gn/gnSourceQualifier.h"
00018
#include "gn/gnLocation.h"
00019
#include "gn/gnStringTools.h"
00020
#include "gn/gnDebug.h"
00021
#include "gn/gnStringQualifier.h"
00022
#include <string>
00023
00024 gnGBKSource::gnGBKSource()
00025 {
00026 m_openString =
"";
00027 m_pFilter =
gnFilter::proteinSeqFilter();
00028
if(m_pFilter == NULL){
00029
DebugMsg(
"Error using static sequence filters.");
00030 }
00031 }
00032 gnGBKSource::gnGBKSource(
const gnGBKSource& s ) :
gnFileSource(s)
00033 {
00034 vector< gnFileContig* >::const_iterator iter = s.
m_contigList.begin();
00035
for( ; iter != s.
m_contigList.end(); ++iter )
00036 {
00037
m_contigList.push_back( (*iter)->Clone() );
00038 }
00039 }
00040 gnGBKSource::~gnGBKSource()
00041 {
00042 m_ifstream.close();
00043 vector< gnFileContig* >::iterator iter =
m_contigList.begin();
00044
for( ; iter !=
m_contigList.end(); ++iter )
00045 {
00046
gnFileContig* fg = *iter;
00047 *iter = 0;
00048
delete fg;
00049 }
00050 }
00051 boolean gnGBKSource::HasContig(
const string& name )
const
00052
{
00053
for(
uint32 i = 0 ; i <=
m_contigList.size(); i++ )
00054 {
00055
if( name ==
m_contigList[i]->GetName() )
00056
return true;
00057 }
00058
return false;
00059 }
00060 uint32 gnGBKSource::GetContigID(
const string& name )
const
00061
{
00062
for(
uint32 i = 0 ; i <=
m_contigList.size(); i++ )
00063 {
00064
if( name ==
m_contigList[i]->GetName() )
00065
return i;
00066 }
00067
return ALL_CONTIGS;
00068 }
00069 string
gnGBKSource::GetContigName(
const uint32 i )
const
00070
{
00071
if( i <
m_contigList.size() )
00072 {
00073
return m_contigList[i]->GetName();
00074 }
00075
return "";
00076 }
00077 gnSeqI gnGBKSource::GetContigSeqLength(
const uint32 i )
const
00078
{
00079
if( i == ALL_CONTIGS)
00080
return m_spec->
GetLength();
00081
if( i <
m_contigList.size() )
00082 {
00083
return m_contigList[i]->GetSeqLength();
00084 }
00085
return GNSEQI_ERROR;
00086 }
00087
00088 boolean gnGBKSource::SeqRead(
const gnSeqI start,
char* buf,
gnSeqI& bufLen,
const uint32 contigI ){
00089
uint64 startPos = 0;
00090
uint64 readableBytes = 0;
00091
if( !
SeqSeek( start, contigI, startPos, readableBytes ) )
00092 {
00093 bufLen = 0;
00094
return false;
00095 }
00096
00097
if( contigI == ALL_CONTIGS )
00098 {
00099
uint32 curLen = 0;
00100
uint64 bytesRead = 0;
00101
while (curLen < bufLen)
00102 {
00103
00104
if(readableBytes <= 0)
00105
if( !
SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106 bufLen = curLen;
00107
return true;
00108 }
00109
00110
uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00111 Array<gnSeqC> array_buf( readLen );
00112
gnSeqC* tmpBuf = array_buf.data;
00113
00114
00115 m_ifstream.read(tmpBuf, readLen);
00116
uint64 gc = m_ifstream.gcount();
00117 bytesRead += gc;
00118 readableBytes -= gc;
00119
for(
uint32 i=0; i < gc; i++){
00120
if( m_pFilter->
IsValid(tmpBuf[i]) ){
00121 buf[curLen] = tmpBuf[i];
00122 curLen++;
00123 }
00124 }
00125
if(m_ifstream.eof()){
00126 m_ifstream.clear();
00127 bufLen = curLen;
00128
return true;
00129 }
00130 }
00131 bufLen = curLen;
00132 }
00133
else if( contigI <
m_contigList.size() )
00134 {
00135
uint32 curLen = 0;
00136
00137
gnSeqI contigSize =
m_contigList[contigI]->GetSeqLength();
00138 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139
while (curLen < bufLen)
00140 {
00141
uint64 readLen = bufLen - curLen;
00142 Array<gnSeqC> array_buf( readLen );
00143
gnSeqC* tmpBuf = array_buf.data;
00144
00145
00146 m_ifstream.read(tmpBuf, readLen);
00147
uint64 gc = m_ifstream.gcount();
00148
00149
00150
for(
uint32 i=0; i < gc; i++){
00151
if( m_pFilter->
IsValid(tmpBuf[i]) ){
00152 buf[curLen] = tmpBuf[i];
00153 curLen++;
00154 }
00155 }
00156
if(m_ifstream.eof()){
00157 m_ifstream.clear();
00158 bufLen = curLen;
00159
return true;
00160 }
00161 }
00162 bufLen = curLen;
00163 }
00164
return true;
00165
00166 }
00167
00168
00169
00170
00171 boolean gnGBKSource::SeqSeek(
const gnSeqI start,
const uint32& contigI,
uint64& startPos,
uint64& readableBytes )
00172 {
00173
if( contigI == ALL_CONTIGS )
00174 {
00175
00176
gnSeqI curIndex = 0;
00177 vector< gnFileContig* >::iterator iter =
m_contigList.begin();
00178
for( ; iter !=
m_contigList.end(); ++iter )
00179 {
00180
uint64 len = (*iter)->GetSeqLength();
00181
if( (curIndex + len) > start )
00182
break;
00183 curIndex += len;
00184 }
00185
if( iter ==
m_contigList.end() )
00186
return false;
00187
00188
gnSeqI startIndex = start - curIndex;
00189
return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190 }
00191
else if( contigI <
m_contigList.size() )
00192 {
00193
return SeqStartPos( start, *(
m_contigList[contigI]), startPos, readableBytes );
00194 }
00195
return false;
00196 }
00197
00198 boolean gnGBKSource::SeqStartPos(
const gnSeqI start,
gnFileContig& contig,
uint64& startPos,
uint64& readableBytes )
00199 {
00200 readableBytes = 0;
00201
uint32 curLen = 0;
00202
00203 startPos = contig.
GetSectStartEnd(
gnContigSequence).first;
00204 m_ifstream.seekg( startPos, ios::beg );
00205
if( m_ifstream.eof() ){
00206
ErrorMsg(
"ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00207
return false;
00208 }
00209
while(
true )
00210 {
00211
00212
00213
uint32 tmpbufsize = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00214
if(tmpbufsize == 0){
00215
ErrorMsg(
"ERROR in gnGBKSource: stored contig size is incorrect.");
00216
return false;
00217 }
00218
uint64 startOffset = start;
00219
if(contig.
HasRepeatSeqGap()){
00220 startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1;
00221
if( m_newlineSize == 2 )
00222 startOffset--;
00223 startPos+=startOffset;
00224 m_ifstream.seekg(startPos , ios::beg);
00225 readableBytes = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00226
return true;
00227 }
00228
00229
00230 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00231 Array<char> array_buf( tmpbufsize );
00232
char* tmpbuf = array_buf.data;
00233
00234 m_ifstream.read( tmpbuf, tmpbufsize );
00235
if( m_ifstream.eof() ){
00236
ErrorMsg(
"ERROR in gnGBKSource::Read End of file reached!\n");
00237
return false;
00238 }
00239
for(
uint32 i=0; i < tmpbufsize; ++i ){
00240
if( m_pFilter->
IsValid(tmpbuf[i]) ){
00241
if( curLen >= start ){
00242 startPos += i;
00243 m_ifstream.seekg( startPos, ios::beg );
00244 readableBytes = contig.
GetSectStartEnd(
gnContigSequence).second - startPos;
00245
return true;
00246 }
00247 ++curLen;
00248 }
00249 }
00250 startPos += tmpbufsize;
00251 }
00252
return true;
00253 }
00254
00255 void gnGBKSource::FormatString(string& data,
uint32 offset,
uint32 width){
00256
00257 string::size_type newline_loc = data.find_first_of(
'\n', 0);
00258
while(newline_loc != string::npos){
00259
if(data[newline_loc-1] ==
'\r')
00260 newline_loc--;
00261 string::size_type text_loc = newline_loc;
00262
while((data[text_loc] ==
' ') ||(data[text_loc] ==
' ')||(data[text_loc] ==
'\n')||(data[text_loc] ==
'\r')){
00263 text_loc++;
00264
if(text_loc+1 == data.length())
00265
break;
00266 }
00267 data = (data.substr(0, newline_loc) +
" " + data.substr(text_loc));
00268 newline_loc = data.find_first_of(
'\n', 0);
00269 }
00270
00271 string output_string =
"";
00272
for(
uint32 charI = 0; charI < data.length();){
00273
00274 string::size_type base_loc = charI;
00275 string append_string;
00276
while(base_loc - charI <= width){
00277 string::size_type space_loc = data.find_first_of(
' ', base_loc+1);
00278
if(space_loc - charI < width)
00279 base_loc = space_loc;
00280
else if(base_loc == charI){
00281
00282 append_string = data.substr(charI, width);
00283 charI+=width;
00284 }
else{
00285 append_string = data.substr(charI, base_loc - charI);
00286 charI = base_loc;
00287 }
00288 }
00289 output_string += string(offset,
' ') + append_string;
00290
if(charI + width < data.length())
00291 output_string +=
"\r\n";
00292 }
00293 data = output_string;
00294 }
00295
00296
template<
class SubSpec >
00297 void WriteHeader(
gnMultiSpec< SubSpec >* spec,
const string& hdr, ofstream& m_ofstream) {
00298
gnBaseHeader* gpbh = NULL;
00299
uint32 header_index = 0;
00300
try{
00301
do{
00302 gpbh = spec->
GetHeader(hdr, header_index);
00303 m_ofstream << gpbh->
GetHeader();
00304 header_index++;
00305 }
while(gpbh != NULL);
00306 }
catch(
gnException& gne){}
00307 }
00308
00309 boolean gnGBKSource::Write(
gnSequence& seq,
const string& filename){
00310 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00311
if(!m_ofstream.is_open())
00312
return false;
00313
00314 string newline =
"\r\n";
00315
gnGenomeSpec* spec = seq.
GetSpec();
00316
00317
00318
if(spec->
GetHeaderListLength() == 1){
00319
gnBaseHeader *gpbh = spec->
GetHeader(0);
00320 string name = gpbh->
GetHeaderName();
00321
00322
if(string::npos != name.find(
".SEQ")){
00323 string header = gpbh->
GetHeader();
00324 m_ofstream << header;
00325 }
00326 }
00327
00328
00329
00330 Array<gnSeqC> array_buf( 2 * BUFFER_SIZE );
00331
gnSeqC *bases = array_buf.data;
00332
00333
for(
uint32 specI = 0; specI < spec->
GetSpecListLength(); specI++){
00334
gnFragmentSpec* subSpec = spec->
GetSpec(specI);
00335
00336
00337 m_ofstream <<
"LOCUS ";
00338
00339 string contigName = subSpec->
GetName();
00340
if(contigName.length() >
SEQ_LOCUS_NAME_LENGTH)
00341 contigName = contigName.substr(0,
SEQ_LOCUS_NAME_LENGTH);
00342
uint32 filler_size =
SEQ_LOCUS_NAME_LENGTH - contigName.length();
00343 m_ofstream << contigName << string(filler_size,
' ');
00344
00345 string length_string =
uintToString(subSpec->
GetLength());
00346 filler_size =
SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00347 m_ofstream << string(filler_size,
' ') << length_string <<
" bp ";
00348
00349 string dnatype = string(
SEQ_LOCUS_DNATYPE_LENGTH,
' ');
00350
uint32 head_look_i = 0;
00351
gnBaseHeader* gpbh = NULL;
00352
try{
00353 gpbh = subSpec->
GetHeader(
"LOCUS", head_look_i);
00354 }
catch(
gnException& gne){}
00355
if( gpbh != NULL )
00356 dnatype = gpbh->
GetHeader().substr(
SEQ_LOCUS_DNATYPE_OFFSET,
SEQ_LOCUS_DNATYPE_LENGTH);
00357 m_ofstream << dnatype << string(2,
' ');
00358
00359 string circular = subSpec->
IsCircular() ? string(
"circular ") : string(10,
' ');
00360 m_ofstream << circular;
00361
00362 string division = string(
SEQ_LOCUS_DIVCODE_LENGTH,
' ');
00363
if(gpbh != NULL)
00364 division = gpbh->
GetHeader().substr(
SEQ_LOCUS_DIVCODE_OFFSET,
SEQ_LOCUS_DIVCODE_LENGTH);
00365 m_ofstream << division;
00366
00367 string date = string(
SEQ_LOCUS_DATE_LENGTH,
' ');
00368
if(gpbh != NULL)
00369 date = gpbh->
GetHeader().substr(
SEQ_LOCUS_DATE_OFFSET,
SEQ_LOCUS_DATE_LENGTH);
00370 m_ofstream << string(7,
' ') << date <<
"\r\n";
00371
00372
00373
WriteHeader(subSpec,
"DEFINITION", m_ofstream);
00374
WriteHeader(subSpec,
"ACCESSION", m_ofstream);
00375
WriteHeader(subSpec,
"VERSION", m_ofstream);
00376
WriteHeader(subSpec,
"KEYWORDS", m_ofstream);
00377
WriteHeader(subSpec,
"SEGMENT", m_ofstream);
00378
WriteHeader(subSpec,
"SOURCE", m_ofstream);
00379
WriteHeader(subSpec,
"REFERENCE", m_ofstream);
00380
WriteHeader(subSpec,
"COMMENT", m_ofstream);
00381
00382
00383 m_ofstream <<
"FEATURES Location/Qualifiers" <<
"\r\n";
00384
for(
uint32 featureI = 0; featureI < subSpec->
GetFeatureListLength(); featureI++){
00385
00386
gnBaseFeature *gpmf = subSpec->
GetFeature(featureI);
00387 string featureName = gpmf->
GetName();
00388 m_ofstream << string(
SEQ_SUBTAG_COLUMN,
' ') << featureName;
00389 m_ofstream << string(
SEQ_FEATURE_LOC_OFFSET - featureName.length() -
SEQ_SUBTAG_COLUMN,
' ');
00390
00391
uint32 location_count = gpmf->
GetLocationListLength();
00392
uint32 line_pos =
SEQ_FEATURE_LOC_OFFSET;
00393
uint32 parenthesis_count = 0;
00394
if(location_count > 1){
00395 m_ofstream <<
"join(";
00396 line_pos += 5;
00397 parenthesis_count++;
00398 }
00399 gnLocation::gnLocationType loc_type = gpmf->
GetLocationType();
00400
switch(loc_type){
00401
case gnLocation::LT_Standard:
00402
break;
00403
case gnLocation::LT_Complement:
00404 m_ofstream <<
"complement(";
00405 line_pos += 11;
00406 parenthesis_count++;
00407
break;
00408
case gnLocation::LT_Order:
00409 m_ofstream <<
"order(";
00410 line_pos += 6;
00411 parenthesis_count++;
00412
break;
00413
case gnLocation::LT_Group:
00414 m_ofstream <<
"group(";
00415 parenthesis_count++;
00416 line_pos += 6;
00417
break;
00418
case gnLocation::LT_OneOf:
00419 m_ofstream <<
"one-of(";
00420 parenthesis_count++;
00421 line_pos += 7;
00422
break;
00423
default:
00424
break;
00425 }
00426
00427 string location;
00428
for(
uint32 locationI = 0; locationI < location_count; locationI++){
00429
gnLocation gpl = gpmf->
GetLocation(locationI);
00430
if(gpl.
IsStartBoundLonger())
00431 location +=
">";
00432
if(gpl.
IsStartBoundShorter())
00433 location +=
"<";
00434 location +=
uintToString(gpl.
GetStart());
00435
gnSeqI end_loc = gpl.
GetEnd();
00436
if(end_loc != 0){
00437
switch(gpl.
GetType()){
00438
case gnLocation::LT_BetweenBases:
00439 location +=
"^";
00440
break;
00441
case gnLocation::LT_OneOf:
00442 location +=
".";
00443
break;
00444
default:
00445 location +=
"..";
00446
break;
00447 }
00448
if(gpl.
IsEndBoundShorter())
00449 location +=
"<";
00450
if(gpl.
IsEndBoundLonger())
00451 location +=
">";
00452 location+=
uintToString(end_loc);
00453 }
00454
if(locationI +1 < location_count)
00455 location +=
",";
00456
else{
00457
for(;parenthesis_count > 0; parenthesis_count--)
00458 location +=
")";
00459 }
00460
00461
if(line_pos + location.length() <
SEQ_COLUMN_WIDTH){
00462 m_ofstream << location;
00463 line_pos += location.length();
00464 }
else{
00465 m_ofstream <<
"\r\n" << string(
SEQ_FEATURE_LOC_OFFSET,
' ') << location;
00466 line_pos =
SEQ_FEATURE_LOC_OFFSET + location.length();
00467 }
00468 location =
"";
00469 }
00470 m_ofstream <<
"\r\n";
00471
00472
00473
00474
uint32 qualifier_count = gpmf->
GetQualifierListLength();
00475
for(
uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00476 m_ofstream << string(
SEQ_FEATURE_LOC_OFFSET,
' ');
00477
gnBaseQualifier* qualifier = gpmf->
GetQualifier(qualifierI);
00478 m_ofstream <<
"/" << qualifier->
GetName() <<
"=";
00479
00480 string qually = string(qualifier->
GetValue());
00481
00482
00483 m_ofstream << qually <<
"\r\n";
00484 }
00485
if(gpmf != NULL)
00486
delete gpmf;
00487 }
00488
00489
00490
gnSeqI readOffset = seq.
contigStart(specI);
00491
gnSeqI readLength = seq.
contigLength(specI);
00492
00493
00494 m_ofstream <<
"BASE COUNT ";
00495
gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00496
gnSeqI countLen = readLength + readOffset;
00497
for(
gnSeqI countI = readOffset; countI < countLen;){
00498
gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00499
if(!seq.
ToArray(bases, writeLen, countI))
00500
return false;
00501
gnSeqI a, c, g, t, other;
00502
BaseCount(string(bases, writeLen), a, c, g, t, other);
00503 a_count += a;
00504 c_count += c;
00505 g_count += g;
00506 t_count += t;
00507 other_count += other;
00508 countI += writeLen;
00509 }
00510 m_ofstream <<
uintToString(a_count) <<
" a ";
00511 m_ofstream <<
uintToString(c_count) <<
" c ";
00512 m_ofstream <<
uintToString(g_count) <<
" g ";
00513 m_ofstream <<
uintToString(t_count) <<
" t ";
00514 m_ofstream <<
uintToString(other_count) <<
" others" <<
"\r\n";
00515
00516 string origin =
"ORIGIN\r\n";
00517 head_look_i = 0;
00518
try{
00519 gpbh = subSpec->
GetHeader(
"ORIGIN", head_look_i);
00520 origin = gpbh->
GetHeader();
00521 m_ofstream << origin;
00522 }
catch(
gnException& gne){
00523 m_ofstream <<
"ORIGIN" << endl;
00524 }
00525
00526
gnSeqI contig_bases = 0;
00527
while(readLength > 0){
00528
gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00529
boolean success = seq.
ToArray(bases, writeLen, readOffset);
00530
if(!success)
00531
return false;
00532
00533
for(
gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00534 string baseIndexStr =
uintToString(contig_bases + curbaseI +1);
00535 m_ofstream << string(
SEQ_BASES_INDEX_END - baseIndexStr.length(),
' ');
00536 m_ofstream << baseIndexStr;
00537
for(
gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00538
if(writeLen <= curbaseI + base_offset)
00539
break;
00540
int64 print_length = writeLen - (curbaseI + base_offset);
00541 print_length = print_length > 10 ? 10 : print_length;
00542 m_ofstream <<
' ' << string(bases + curbaseI + base_offset, print_length);
00543 }
00544 m_ofstream <<
"\r\n";
00545 }
00546 readLength -= writeLen;
00547 readOffset += writeLen;
00548 contig_bases += writeLen;
00549 }
00550 m_ofstream <<
"//\r\n";
00551 }
00552
00553 m_ofstream.close();
00554
return true;
00555 }
00556
00557 gnFileContig*
gnGBKSource::GetFileContig(
const uint32 contigI )
const{
00558
if(
m_contigList.size() > contigI)
00559
return m_contigList[contigI];
00560
return NULL;
00561 }
00562
00563
00564 boolean gnGBKSource::ParseStream( istream& fin )
00565 {
00566
00567
uint32 readState = 0;
00568
uint32 lineStart = 0;
00569
00570
uint32 sectionStart = 0;
00571
uint64 streamPos = 0;
00572
uint64 bufReadLen = 0;
00573
uint64 remainingBuffer = 0;
00574 Array<char> array_buf( BUFFER_SIZE );
00575
char* buf = array_buf.data;
00576
gnFragmentSpec* curFrag = 0;
00577
gnSourceSpec* curSpec = 0;
00578
gnSourceHeader *curHeader;
00579
gnFeature* curFeature;
00580
gnFileContig* curContig = 0;
00581 gnLocation::gnLocationType curBaseLocationType;
00582
gnSeqI curLocationStart;
00583
int32 curStartLength = 0;
00584
int32 curEndLength = 0;
00585 string curLocContig =
"";
00586 string curQualifierName;
00587
uint64 curQualifierStart;
00588 string curContigName =
"";
00589
gnSeqI seqLength = 0;
00590
gnSeqI seqChunk, seqChunkCount, gapChunk;
00591
boolean corruptWarning =
false;
00592
00593
00594
DetermineNewlineType();
00595
00596
m_spec =
new gnGenomeSpec();
00597
while( !fin.eof() )
00598 {
00599
if(sectionStart > 0){
00600
if(readState == 14)
00601 sectionStart = lineStart;
00602 remainingBuffer = bufReadLen - sectionStart;
00603 memmove(buf, buf+sectionStart, remainingBuffer);
00604 }
00605
00606 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00607 streamPos -= remainingBuffer;
00608 lineStart -= sectionStart;
00609 sectionStart = 0;
00610 bufReadLen = fin.gcount();
00611 bufReadLen += remainingBuffer;
00612
00613
for(
uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00614 {
00615
char ch = buf[i];
00616
switch( readState )
00617 {
00618
case 0:
00619
00620
if((ch ==
'\n')&&(buf[lineStart] !=
' ')&&(buf[lineStart] !=
' ')){
00621
if(curSpec == NULL){
00622 curSpec =
new gnSourceSpec(
this,
m_spec->
GetSpecListLength());
00623 curFrag =
new gnFragmentSpec();
00624 curFrag->
AddSpec(curSpec);
00625 curSpec->
SetSourceName(m_openString);
00626
m_spec->
AddSpec(curFrag);
00627 }
00628
if(lineStart != sectionStart){
00629
uint32 j =
SEQ_HEADER_NAME_LENGTH-1;
00630
for(; j > 0; j--)
00631
if((buf[sectionStart+j] !=
' ')&&(buf[sectionStart+j] !=
' '))
00632
break;
00633 string header_name = string(buf+sectionStart, j+1);
00634 curHeader =
new gnSourceHeader(
this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00635
00636
if(strncmp(&buf[lineStart],
"LOCUS", 5) == 0)
00637
m_spec->
AddHeader(curHeader);
00638
else
00639 curFrag->
AddHeader(curHeader);
00640 sectionStart = lineStart;
00641 }
00642
00643
if(strncmp(&buf[lineStart],
"FEATURES", 8) == 0){
00644 sectionStart = i + 1;
00645 readState = 1;
00646 }
else if(strncmp(&buf[lineStart],
"ORIGIN", 6) == 0){
00647 curHeader =
new gnSourceHeader(
this, string(
"ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00648 curFrag->
AddHeader(curHeader);
00649 curContig =
new gnFileContig();
00650 curContig->
SetName(curContigName);
00651 curContigName =
"";
00652 readState = 13;
00653 }
else if(strncmp(&buf[lineStart],
"LOCUS", 5) == 0){
00654
if(strncmp(&buf[lineStart+
SEQ_LOCUS_CIRCULAR_COLUMN-1],
"circular", 8) == 0)
00655 curFrag->
SetCircular(
true);
00656
uint32 j =
SEQ_LOCUS_NAME_LENGTH+1;
00657
for(; j > 0; j--)
00658
if((buf[lineStart+
SEQ_LOCUS_NAME_COLUMN+j-1] !=
' ')&&(buf[sectionStart+
SEQ_LOCUS_NAME_COLUMN+j-1] !=
' '))
00659
break;
00660 curContigName = string(buf+lineStart+
SEQ_LOCUS_NAME_COLUMN-1, j+1);
00661 curFrag->
SetName(curContigName);
00662 }
00663 }
00664
if(ch ==
'\n'){
00665 lineStart = i + 1;
00666 }
00667
break;
00668
case 1:
00669
if((ch ==
' ')||(ch ==
' ')){
00670
break;
00671 }
else if(ch ==
'\n'){
00672 lineStart = i + 1;
00673 sectionStart = i + 1;
00674
break;
00675 }
else if(sectionStart == i){
00676 i--;
00677 readState = 0;
00678 sectionStart = i + 1;
00679
break;
00680 }
else if((i - lineStart ==
SEQ_SUBTAG_COLUMN)||((buf[lineStart]==
' ')&&(i==lineStart+1))){
00681 sectionStart = i;
00682 readState = 2;
00683 }
00684
case 2:
00685
if((ch ==
' ')||(ch ==
' ')){
00686 string featureName(buf+sectionStart, i - sectionStart);
00687 curFeature =
new gnFeature(featureName);
00688 curFrag->
AddFeature(curFeature);
00689 sectionStart = i + 1;
00690 readState = 3;
00691 }
00692
break;
00693
case 3:
00694
if((ch ==
' ')||(ch ==
' ')){
00695
break;
00696 }
else if((ch ==
'\r')||(ch ==
'\n')){
00697 lineStart = i+1;
00698
break;
00699 }
00700 sectionStart = i;
00701 readState = 4;
00702
00703
00704
00705
case 4:
00706
if((ch ==
' ')||(ch ==
' ')||(ch ==
'(')||(ch ==
'.')||(ch==
'^')||(ch==
':')){
00707 string starter(buf+sectionStart, i - sectionStart);
00708
if(ch ==
'('){
00709
if(starter ==
"complement")
00710 curFeature->
SetLocationType(gnLocation::LT_Complement);
00711
else if(starter ==
"order")
00712 curFeature->
SetLocationType(gnLocation::LT_Order);
00713
else if(starter ==
"group")
00714 curFeature->
SetLocationType(gnLocation::LT_Group);
00715
else if(starter ==
"one-of")
00716 curFeature->
SetLocationType(gnLocation::LT_OneOf);
00717 sectionStart = i + 1;
00718
break;
00719 }
else if(ch ==
':'){
00720 curLocContig = starter;
00721 sectionStart = i + 1;
00722
break;
00723 }
00724 curLocationStart = atoi(starter.c_str());
00725 readState = 6;
00726
if(ch ==
'.'){
00727
00728 readState = 5;
00729 sectionStart = i + 1;
00730
break;
00731 }
else if(ch ==
'^'){
00732 curBaseLocationType = gnLocation::LT_BetweenBases;
00733 }
else if((ch ==
' ')||(ch ==
' ')){
00734
00735
gnLocation curLocation(curLocationStart, curLocationStart);
00736 curFeature->
AddLocation(curLocation, curFeature->
GetLocationListLength());
00737 readState = 7;
00738 }
00739 sectionStart = i + 1;
00740
00741 }
else if(ch ==
'<'){
00742 curStartLength = -1;
00743 sectionStart = i + 1;
00744 }
else if(ch ==
'>'){
00745 curStartLength = 1;
00746 sectionStart = i + 1;
00747 }
00748
break;
00749
case 5:
00750
if(ch ==
'.'){
00751 curBaseLocationType = gnLocation::LT_Standard;
00752 readState = 6;
00753 sectionStart = i + 1;
00754
break;
00755 }
00756 curBaseLocationType = gnLocation::LT_OneOf;
00757
case 6:
00758
if(ch ==
'>'){
00759 curEndLength = 1;
00760 sectionStart = i + 1;
00761 }
else if(ch ==
'<'){
00762 curEndLength = -1;
00763 sectionStart = i + 1;
00764 }
else if((ch ==
' ')||(ch ==
' ')||(ch ==
',')){
00765
00766 string ender(buf+sectionStart, i - sectionStart);
00767
gnSeqI curLocationEnd = atoi(ender.c_str());
00768
gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00769 curEndLength = 0;
00770 curStartLength = 0;
00771 curFeature->
AddLocation(curLocation, curFeature->
GetLocationListLength());
00772 readState = ch ==
',' ? 3 : 7;
00773 sectionStart = i+1;
00774 }
00775
break;
00776
case 7:
00777
if((ch !=
' ')&&(ch !=
' ')&&(lineStart == i)){
00778 sectionStart = i;
00779 readState = 0;
00780 i--;
00781 }
else if((ch !=
' ')&&(ch !=
' ')&&((lineStart == i -
SEQ_SUBTAG_COLUMN)||((buf[lineStart]==
' ')&&(i==lineStart+1)))){
00782 sectionStart = i;
00783 readState = 2;
00784 i--;
00785 }
else if(ch ==
','){
00786 sectionStart = i+1;
00787 readState = 3;
00788 }
else if(ch ==
'/'){
00789 sectionStart = i+1;
00790 readState = 8;
00791 }
else if(ch ==
'\n')
00792 lineStart = i + 1;
00793
break;
00794
case 8:
00795
if(ch ==
'='){
00796 curQualifierName = string(buf+sectionStart, i - sectionStart);
00797 readState = 9;
00798 sectionStart = i+1;
00799 }
else if( ch ==
'\r' || ch ==
'\n' ){
00800
00801 curQualifierName = string(buf+sectionStart, i - sectionStart);
00802 curFeature->
AddQualifier(
new gnStringQualifier( curQualifierName,
"" ));
00803 readState = 7;
00804 sectionStart = i+1;
00805 }
00806
break;
00807
case 9:
00808
if(ch ==
'"'){
00809 readState = 10;
00810 sectionStart = i;
00811 curQualifierStart = i + streamPos;
00812 }
else if(ch ==
'['){
00813 readState = 11;
00814 sectionStart = i;
00815 }
else if((ch ==
'\r')||(ch ==
'\n')){
00816 curFeature->
AddQualifier(
new gnSourceQualifier(
this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00817 sectionStart = i+1;
00818 readState = 7;
00819 }
00820
break;
00821
case 10:
00822
if(ch ==
'"')
00823 readState = 11;
00824
if(ch ==
'\n'){
00825 lineStart = i + 1;
00826 }
00827
break;
00828
case 11:
00829
if(ch !=
'"'){
00830 curFeature->
AddQualifier(
new gnSourceQualifier(
this, curQualifierName, curQualifierStart, i - sectionStart));
00831 sectionStart = i+1;
00832 readState = 7;
00833
if(ch ==
'\n')
00834 lineStart = i + 1;
00835 }
else
00836 readState = 10;
00837
break;
00838
case 12:
00839
if(ch ==
']'){
00840 curFeature->
AddQualifier(
new gnSourceQualifier(
this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00841 sectionStart = i+1;
00842 readState = 7;
00843 }
00844
break;
00845
case 13:
00846 curContig->
SetSectStart(
gnContigSequence, i - 1 + streamPos);
00847 curContig->
SetRepeatSeqGap(
true);
00848 seqChunk = 0;
00849 seqChunkCount = 0;
00850 gapChunk = m_newlineSize + 1;
00851 readState = 14;
00852
break;
00853
case 14:
00854
while(i < bufReadLen){
00855 ch = buf[i];
00856
if((ch ==
'/')&&(i==lineStart)){
00857 readState = 15;
00858
break;
00859 }
else if(m_pFilter->
IsValid(ch)){
00860
if(gapChunk > 0){
00861
if((gapChunk > 1 && seqChunkCount > 0) ||
00862 (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){
00863
if( !corruptWarning ){
00864
ErrorMsg(
"File is corrupt. Proceed with caution.");
00865 corruptWarning =
true;
00866 }
00867 curContig->
SetRepeatSeqGap(
false);
00868 }
00869 gapChunk = 0;
00870 }
00871 seqChunk++;
00872 seqLength++;
00873 }
else{
00874 gapChunk++;
00875
if(seqChunk == 10){
00876 seqChunk = 0;
00877 seqChunkCount++;
00878
if(seqChunkCount == 6){
00879
00880 seqChunkCount = 0;
00881 }
00882 }
00883
if(ch ==
'\n')
00884 lineStart = i + 1;
00885 }
00886 i++;
00887 }
00888
break;
00889
case 15:
00890
if((ch ==
'\n')&&(buf[lineStart+1] ==
'/')){
00891 curContig->
SetSectEnd(
gnContigSequence, lineStart - m_newlineSize + streamPos);
00892 curContig->
SetSeqLength(seqLength);
00893
m_contigList.push_back(curContig);
00894 curContig = 0;
00895 curSpec->
SetLength(seqLength);
00896 curSpec = 0;
00897 seqLength = 0;
00898 lineStart = i + 1;
00899 sectionStart = i + 1;
00900 readState = 0;
00901 }
00902
break;
00903 }
00904 }
00905 streamPos += bufReadLen;
00906 }
00907
if(curContig != 0){
00908 curContig->
SetSectEnd(
gnContigSequence, streamPos - 1);
00909 curContig->
SetSeqLength(seqLength);
00910
m_contigList.push_back(curContig);
00911 curSpec->
SetLength(seqLength);
00912 }
00913
if(curSpec != 0)
00914
if((curFrag->
GetFeatureListLength() == 0) && (curFrag->
GetHeaderListLength() == 0)
00915 &&(curSpec->
GetLength() == 0)){
00916
m_spec->
RemoveSpec(
m_spec->
GetSpecListLength() - 1);
00917
delete curFrag;
00918 }
00919 m_ifstream.clear();
00920
return true;
00921 }