00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00012
#include "gn/gnDNXSource.h"
00013
#include "gn/gnSourceSpec.h"
00014
#include "gn/gnStringSpec.h"
00015
#include "gn/gnSourceFactory.h"
00016
#include "gn/gnFASSource.h"
00017
#include "gn/gnGBKSource.h"
00018
#include "gn/gnBaseHeader.h"
00019
#include "gn/gnFilter.h"
00020
#include "gn/gnDebug.h"
00021
#include "gn/gnStringTools.h"
00022
#include <string>
00023
00024 gnDNXSource::gnDNXSource()
00025 {
00026
m_DNXSpec =
new gnGenomeSpec();
00027 m_pFilter =
gnFilter::fullDNASeqFilter();
00028
if(m_pFilter == NULL){
00029
DebugMsg(
"Error using static sequence filters.");
00030 }
00031 }
00032
00033 gnDNXSource::gnDNXSource(
const gnDNXSource& s ) :
gnFileSource(s)
00034 {
00035
if(s.
m_DNXSpec != NULL)
00036
m_DNXSpec = s.
m_DNXSpec->
Clone();
00037 }
00038
00039 gnDNXSource::~gnDNXSource()
00040 {
00041 m_ifstream.close();
00042
delete m_DNXSpec;
00043 }
00044 boolean gnDNXSource::HasContig(
const string& name )
const
00045
{
00046
for(
uint32 contigI = 0; contigI <
m_DNXSpec->
GetSpecListLength(); contigI++){
00047
if(
m_DNXSpec->
GetSpec(contigI)->
GetName() == name)
00048
return true;
00049 }
00050
return false;
00051 }
00052 uint32 gnDNXSource::GetContigID(
const string& name )
const
00053
{
00054
for(
uint32 contigI = 0; contigI <
m_DNXSpec->
GetSpecListLength(); contigI++){
00055
if(
m_DNXSpec->
GetSpec(contigI)->
GetName() == name)
00056
return contigI;
00057 }
00058
return ALL_CONTIGS;
00059 }
00060 string
gnDNXSource::GetContigName(
const uint32 i )
const
00061
{
00062
if(i <
m_DNXSpec->
GetSpecListLength()){
00063
gnBaseSpec *gnbs =
m_DNXSpec->
GetSpec(i);
00064
return gnbs->
GetName();
00065 }
00066
return "";
00067 }
00068 gnSeqI gnDNXSource::GetContigSeqLength(
const uint32 i )
const
00069
{
00070
if( i == ALL_CONTIGS){
00071
return m_DNXSpec->
GetLength();
00072 }
else if(i <
m_DNXSpec->
GetSpecListLength()){
00073
gnBaseSpec *gnbs =
m_DNXSpec->
GetSpec(i);
00074
return gnbs->
GetLength();
00075 }
00076
return 0;
00077 }
00078
00079
00080 void gnDNXSource::ValidateName(string& name){
00081
if(name ==
""){
00082 name.resize(4);
00083 srand(time(NULL));
00084
for(
int i=0; i < 4; i++)
00085 name[i] = (rand() % 26) + 64;
00086 }
00087 }
00088
00089 boolean gnDNXSource::Write(
gnGenomeSpec* spec,
const string& filename){
00090 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00091
gnSourceFactory* m_sSourceFactory =
gnSourceFactory::GetSourceFactory();
00092
if(!m_ofstream.is_open())
00093
return false;
00094
for(
uint32 i=0; i < spec->
GetSpecListLength(); i++){
00095
gnFragmentSpec* curStatementSpec = spec->
GetSpec(i);
00096 string sourceName = spec->
GetSourceName();
00097 string statementName = spec->
GetName();
00098
if(!m_sSourceFactory->
HasSource(sourceName)){
00099
ValidateName(statementName);
00100 statementName +=
".seq";
00101 m_ofstream << statementName <<
"=";
00102 }
else
00103 m_ofstream << sourceName <<
"=";
00104
for(
uint32 j=0; j < curStatementSpec->
GetSpecListLength(); j++){
00105
00106
gnContigSpec* curSubSpec = curStatementSpec->
GetSpec(i);
00107 sourceName = curStatementSpec->
GetSourceName();
00108 string contigName = curStatementSpec->
GetName();
00109
if(!m_sSourceFactory->
HasSource(sourceName)){
00110
ValidateName(contigName);
00111 string writename = contigName+
".seq";
00112
gnSequence gns = *curSubSpec;
00113
gnGBKSource::Write(gns, writename);
00114 m_ofstream << writename;
00115 }
else
00116 m_ofstream << sourceName;
00117
if( j + 1 < curStatementSpec->
GetSpecListLength())
00118 m_ofstream <<
"+";
00119 }
00120 m_ofstream <<
";";
00121
gnBaseHeader *gpbh = spec->
GetHeader(0);
00122 string header =
"";
00123
if(gpbh != NULL){
00124 header = gpbh->
GetHeader();
00125
00126
uint32 newlinepos = header.find_first_of(
'\n', 0);
00127
if(newlinepos != string::npos)
00128 header = header.substr(0, newlinepos - 1);
00129 }
00130 m_ofstream << header <<
"\r\n";
00131 }
00132 m_ofstream.close();
00133
return true;
00134 }
00135
00136 gnFileContig*
gnDNXSource::GetFileContig(
const uint32 contigI )
const{
00137
return NULL;
00138 }
00139
00140
00141 boolean gnDNXSource::ParseStream( istream& fin )
00142 {
00143
00144
uint32 readState = 0;
00145
uint32 sectionStart = 0;
00146
gnFragmentSpec* currentFragSpec = 0;
00147
gnBaseSource *currentSource;
00148 string currentSourceName;
00149
uint32 currentContig = ALL_CONTIGS;
00150
uint32 currentSeqStart = 0;
00151
boolean currentRevComp =
false;
00152
00153
uint64 bufReadLen = 0;
00154
uint64 remainingBuffer = 0;
00155 Array<char> array_buf( BUFFER_SIZE );
00156
char* buf = array_buf.data;
00157 string curliteral;
00158
00159
00160
gnSourceFactory *sourceFactory =
gnSourceFactory::GetSourceFactory();
00161 sourceFactory->
AddPath(
getPathString(m_openString));
00162
00163
while( !fin.eof() )
00164 {
00165
if(sectionStart > 0){
00166 remainingBuffer = bufReadLen - sectionStart;
00167
if(readState == 5){
00168 curliteral += string(buf, sectionStart, remainingBuffer);
00169 remainingBuffer = 0;
00170 sectionStart = bufReadLen;
00171 }
else
00172 memmove(buf, buf+sectionStart, remainingBuffer);
00173 }
00174
00175 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart));
00176 sectionStart = 0;
00177 bufReadLen = fin.gcount() + remainingBuffer;
00178
00179
for(
uint32 i=0 ; i < bufReadLen ; i++ )
00180 {
00181
char ch = buf[i];
00182
switch( readState )
00183 {
00184
case 0:
00185
if(ch ==
'='){
00186
00187 string contigName(buf+sectionStart, i - sectionStart);
00188 currentFragSpec =
new gnFragmentSpec();
00189 currentFragSpec->
SetName(contigName);
00190 currentFragSpec->
SetSourceName(m_openString);
00191
m_DNXSpec->
AddSpec(currentFragSpec);
00192 sectionStart = i+1;
00193 readState = 1;
00194 }
00195
break;
00196
case 1:
00197
if((ch ==
' ')||(ch ==
' '))
00198
break;
00199
case 2:
00200
if(ch ==
'"'){
00201 readState = 5;
00202 sectionStart = i+1;
00203
break;
00204 }
00205 readState = 3;
00206 sectionStart = i;
00207
case 3:
00208
00209
if(ch ==
'\n' && sectionStart == i -1){
00210
if(buf[sectionStart]==
'\r'){
00211 sectionStart = i + 1;
00212
break;
00213 }
00214 }
00215
if((ch ==
'+')||(ch ==
'>')||(ch ==
'(')||(ch ==
'\n')||(ch ==
';')){
00216
00217 string seqfile(buf, sectionStart, i - sectionStart);
00218 currentSourceName = seqfile;
00219 currentSource = sourceFactory->
AddSource(seqfile,
true);
00220
if (currentSource==NULL)
00221 {
00222
return false;
00223 }
00224
if((ch ==
'+')||(ch ==
'\n')||(ch ==
';')){
00225
gnSourceSpec* tmp_spec =
new gnSourceSpec(currentSource);
00226 tmp_spec->
SetSourceName(seqfile);
00227 currentFragSpec->
AddSpec(tmp_spec);
00228 readState = 1;
00229
if(ch ==
'\n'){
00230 readState = 0;
00231 }
else if(ch ==
';'){
00232 readState = 9;
00233 }
00234 }
else if(ch ==
'>'){
00235 readState = 4;
00236 }
else if(ch ==
'('){
00237 readState = 6;
00238 }
00239 sectionStart = i + 1;
00240 }
00241
break;
00242
case 4:
00243
00244
if((ch ==
'+')||(ch ==
'\n')||(ch ==
';')||(ch ==
'(')){
00245
00246 string contigname(buf, sectionStart, i - sectionStart);
00247 currentContig = currentSource->
GetContigID(contigname);
00248
if((ch ==
'+')||(ch ==
'\n')||(ch ==
';')){
00249
gnSourceSpec* tmp_spec =
new gnSourceSpec(currentSource, currentContig);
00250 tmp_spec->
SetSourceName(currentSourceName);
00251 currentFragSpec->
AddSpec(tmp_spec);
00252 readState = 1;
00253
if(ch ==
'\n'){
00254 readState = 0;
00255 }
else if(ch ==
';'){
00256 readState = 9;
00257 }
00258 }
else if(ch ==
'('){
00259 readState = 6;
00260 }
00261 sectionStart = i + 1;
00262 }
00263
break;
00264
case 5:
00265
00266
if(ch ==
'"'){
00267
00268 string literal(buf, sectionStart, i - sectionStart);
00269
if(curliteral.length() > 0){
00270 literal += curliteral;
00271 curliteral =
"";
00272 }
00273
gnStringSpec *gpss =
new gnStringSpec(literal, currentFragSpec->
GetSpecListLength());
00274 currentFragSpec->
AddSpec(gpss);
00275 }
00276
case 6:
00277
00278
if((ch ==
',') || (ch ==
'<') || (ch ==
'>')){
00279 string seqstartstring(buf, sectionStart, i - sectionStart);
00280
if(seqstartstring ==
"lend"){
00281 currentSeqStart = 0;
00282 }
else if (seqstartstring ==
"rend"){
00283 currentSeqStart =
GNSEQI_END;
00284 }
else
00285 currentSeqStart = atoi(seqstartstring.c_str()) - 1;
00286
if(ch ==
'<')
00287 currentRevComp =
true;
00288 sectionStart = i + 1;
00289 readState = 7;
00290 }
00291
break;
00292
case 7:
00293
00294
if(ch ==
')'){
00295 string seqendstring(buf, sectionStart, i - sectionStart);
00296
uint32 currentSeqEnd =
GNSEQI_END;
00297
if(seqendstring ==
"lend"){
00298 currentSeqEnd = 0;
00299 }
else if (seqendstring ==
"rend"){
00300 currentSeqEnd =
GNSEQI_END;
00301 }
else
00302 currentSeqEnd = atoi(seqendstring.c_str()) - 1;
00303
gnSourceSpec* tmp_spec =
new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp);
00304 tmp_spec->
SetSourceName(currentSourceName);
00305 currentFragSpec->
AddSpec(tmp_spec);
00306 currentRevComp =
false;
00307 sectionStart = i + 1;
00308 readState = 8;
00309 }
00310
break;
00311
case 8:
00312
if(ch ==
'+'){
00313 sectionStart = i + 1;
00314 readState = 1;
00315 }
00316
if(ch ==
'\n'){
00317 sectionStart = i + 1;
00318 readState = 0;
00319 }
00320
if(ch ==
';'){
00321 sectionStart = i + 1;
00322 readState = 9;
00323 }
00324
break;
00325
case 9:
00326
if(ch ==
'\n'){
00327 sectionStart = i + 1;
00328 readState = 0;
00329 }
00330
break;
00331
default:
00332
DebugMsg(
"ERROR in file\n");
00333
return false;
00334
break;
00335 }
00336 }
00337 }
00338
00339
return true;
00340 }