src/gnSequence.cpp

Go to the documentation of this file.
00001 00002 // File: gnSequence.h 00003 // Purpose: Source Seq, where sequence is stored in source file 00004 // Description: implements the baseSeq interface for dna source files. 00005 // Changes: 00006 // Version: libGenome 0.5.1 00007 // Author: Aaron Darling 00008 // Modified by: 00009 // Copyright: (c) Aaron Darling 00010 // Licenses: See COPYING file for details 00012 #include "gn/gnSequence.h" 00013 #include "gn/gnSourceFactory.h" 00014 #include "gn/gnBaseSource.h" 00015 #include "gn/gnGenomeSpec.h" 00016 #include "gn/gnFragmentSpec.h" 00017 #include "gn/gnSourceSpec.h" 00018 #include "gn/gnStringSpec.h" 00019 #include "gn/gnStringHeader.h" 00020 00021 // Constructor 00022 gnSequence::gnSequence( ) 00023 { 00024 spec = new gnGenomeSpec(); 00025 comparator = gnCompare::DNASeqCompare(); 00026 } 00027 gnSequence::gnSequence( const gnSeqC* seq){ 00028 spec = new gnGenomeSpec(); 00029 if (seq[0] != 0){ 00030 gnFragmentSpec *fragSpec = new gnFragmentSpec(); 00031 spec->AddSpec(fragSpec); 00032 fragSpec->AddSpec(new gnStringSpec(seq)); 00033 } 00034 comparator = gnCompare::DNASeqCompare(); 00035 } 00036 gnSequence::gnSequence( const string& str){ 00037 spec = new gnGenomeSpec(); 00038 if (str.length() != 0){ 00039 gnFragmentSpec *fragSpec = new gnFragmentSpec(); 00040 spec->AddSpec(fragSpec); 00041 fragSpec->AddSpec(new gnStringSpec(str)); 00042 } 00043 comparator = gnCompare::DNASeqCompare(); 00044 } 00045 gnSequence::gnSequence( const gnGenomeSpec& gngs){ 00046 spec = gngs.Clone(); 00047 comparator = gnCompare::DNASeqCompare(); 00048 } 00049 gnSequence::gnSequence( const gnFragmentSpec& gnfs){ 00050 spec = new gnGenomeSpec(); 00051 spec->AddSpec(gnfs.Clone()); 00052 comparator = gnCompare::DNASeqCompare(); 00053 } 00054 gnSequence::gnSequence( const gnContigSpec& gcs){ 00055 spec = new gnGenomeSpec(); 00056 gnFragmentSpec *fragSpec = new gnFragmentSpec(); 00057 fragSpec->AddSpec(gcs.Clone()); 00058 comparator = gnCompare::DNASeqCompare(); 00059 } 00060 gnSequence::gnSequence( gnSeqC *bases, gnSeqI length){ 00061 spec = new gnGenomeSpec(); 00062 if (length > 0){ 00063 gnFragmentSpec *fragSpec = new gnFragmentSpec(); 00064 spec->AddSpec(fragSpec); 00065 fragSpec->AddSpec(new gnStringSpec(string(bases, length))); 00066 } 00067 comparator = gnCompare::DNASeqCompare(); 00068 } 00069 gnSequence::gnSequence( const gnSequence& seq ) 00070 { 00071 *this = seq; 00072 } 00073 00074 // Destructor 00075 gnSequence::~gnSequence() 00076 { 00077 if (spec != NULL) 00078 delete spec; 00079 } 00080 00081 gnSequence& gnSequence::operator=( const gnSequence& seq){ 00082 spec = seq.spec->Clone(); 00083 filter_list = seq.filter_list; 00084 comparator = seq.comparator; 00085 return *this; 00086 } 00087 00088 // Clone 00089 gnSequence* gnSequence::Clone() const 00090 { 00091 return new gnSequence( *this ); 00092 } 00093 //IMPLEMENT ME!! Consider ambiguities! 00094 int gnSequence::compare(const string& str) const{ 00095 STACK_TRACE_START 00096 gnSeqI len = length(); 00097 gnSeqI seq_len = str.length(); 00098 gnSeqI comparelen = len < seq_len ? len : seq_len; 00099 gnSeqI compared = 0; 00100 while(comparelen > 0){ 00101 gnSeqI curlen = comparelen < BUFFER_SIZE ? comparelen : BUFFER_SIZE; 00102 string bases = ToString(compared, curlen); 00103 string seq_bases = str.substr(compared, curlen); 00104 if(comparator->LessThan(bases, seq_bases)) 00105 return -1; 00106 else if(comparator->LessThan(seq_bases, bases)) 00107 return 1; 00108 00109 compared += curlen; 00110 comparelen -= curlen; 00111 } 00112 if(len < seq_len) 00113 return -1; 00114 else if(len > seq_len) 00115 return 1; 00116 return 0; 00117 STACK_TRACE_END 00118 } 00119 00120 int gnSequence::compare(const gnSequence& seq) const{ 00121 STACK_TRACE_START 00122 gnSeqI len = length(); 00123 gnSeqI seq_len = seq.length(); 00124 gnSeqI comparelen = len < seq_len ? len : seq_len; 00125 gnSeqI compared = 0; 00126 while(comparelen > 0){ 00127 gnSeqI curlen = comparelen < BUFFER_SIZE ? comparelen : BUFFER_SIZE; 00128 string bases = ToString(curlen, compared+1); 00129 string seq_bases = seq.ToString(curlen, compared+1); 00130 if(comparator->LessThan(bases, seq_bases)) 00131 return -1; 00132 else if(comparator->LessThan(seq_bases, bases)) 00133 return 1; 00134 00135 compared+= curlen; 00136 comparelen -= curlen; 00137 } 00138 if(len < seq_len) 00139 return -1; 00140 else if(len > seq_len) 00141 return 1; 00142 return 0; 00143 STACK_TRACE_END 00144 } 00145 00146 void gnSequence::insert( const gnSeqI offset, const gnSeqC *bases, const gnSeqI len){ 00147 STACK_TRACE_START 00148 string str(bases, len); 00149 gnStringSpec gpbs(str); 00150 insert(offset, gpbs); 00151 STACK_TRACE_END 00152 } 00153 //Offset is the gene index, not a computer index, to insert before. 00154 //This is the default insert. It inserts entire contigs 00155 void gnSequence::insert( const gnSeqI offset, const gnGenomeSpec& gpbs){ 00156 STACK_TRACE_START 00157 if(offset == 0) 00158 Throw_gnEx(SeqIndexOutOfBounds()); 00159 if(offset == GNSEQI_END || spec->GetLength() < offset){ //simple append. 00160 for(uint32 i=0; i < gpbs.GetSpecListLength(); i++) 00161 spec->AddSpec(gpbs.GetSpec(i)->Clone()); 00162 return; 00163 } 00164 //clone this sequence 00165 gnGenomeSpec* tmpSpec = spec->Clone(); 00166 //crop off the end of this sequence past the insert point 00167 //crop off the beginning of the cloned sequence up to the insert 00168 gnSeqI real_offset = offset - 1; 00169 gnSeqI endCrop = spec->GetLength() - real_offset; 00170 spec->CropEnd(endCrop); 00171 tmpSpec->CropStart(real_offset); 00172 //append the inserted sequence and the end of this sequence. 00173 insert(GNSEQI_END, gpbs); 00174 insert(GNSEQI_END, *tmpSpec); 00175 delete tmpSpec; 00176 STACK_TRACE_END 00177 } 00178 // Concatenation operators 00179 gnSequence const gnSequence::operator+(const gnSequence& seq) const{ 00180 STACK_TRACE_START 00181 gnSequence rval(*this); 00182 rval.append(seq); 00183 return rval; 00184 STACK_TRACE_END 00185 } 00186 // Subsequences and base deletion 00187 gnSequence gnSequence::subseq(const gnSeqI offset, const gnSeqI length) const{ 00188 STACK_TRACE_START 00189 if(length == 0) 00190 return gnSequence(); 00191 if(offset == 0) 00192 Throw_gnEx(SeqIndexOutOfBounds()); 00193 00194 gnSequence tmpSeq; 00195 delete tmpSeq.spec; 00196 tmpSeq.spec = spec->CloneRange(offset - 1, length); 00197 return tmpSeq; 00198 STACK_TRACE_END 00199 } 00200 void gnSequence::erase( const gnSeqI offset, const gnSeqI len ){ 00201 STACK_TRACE_START 00202 if(offset == 0) 00203 Throw_gnEx(SeqIndexOutOfBounds()); 00204 //range checking, etc. 00205 gnSeqI current_length = length(); 00206 if(offset > current_length) 00207 Throw_gnEx(SeqIndexOutOfBounds()); 00208 gnSeqI endBase = offset + len - 1 < current_length ? offset + len - 1 : current_length; 00209 gnSeqI startBase = offset - 1; 00210 00211 gnGenomeSpec* tmpSpec = spec->CloneRange(endBase, GNSEQI_END); 00212 uint32 lennard = tmpSpec->GetLength(); 00213 spec->CropEnd(current_length - startBase); 00214 lennard = length(); 00215 insert(GNSEQI_END, *tmpSpec); 00216 delete tmpSpec; 00217 STACK_TRACE_END 00218 } 00219 void gnSequence::splitContig(const gnSeqI splitI, const uint32 contigI){ 00220 STACK_TRACE_START 00221 gnSeqI splitBase = splitI; 00222 gnSeqI current_length = length(); 00223 if(splitI == 0) 00224 Throw_gnEx(SeqIndexOutOfBounds()); 00225 if(contigI == ALL_CONTIGS && splitI > current_length) 00226 Throw_gnEx(SeqIndexOutOfBounds()); 00227 else 00228 localToGlobal(contigI, splitBase); 00229 gnGenomeSpec* tmpSpec = spec->Clone(); 00230 tmpSpec->CropStart(splitBase); 00231 spec->CropEnd(current_length - splitBase); 00232 00233 insert(GNSEQI_END, *tmpSpec); 00234 delete tmpSpec; 00235 STACK_TRACE_END 00236 } 00237 00238 // IO operators 00239 istream& operator>>(istream& is, gnSequence& gps){ //read from source. 00240 string bases; 00241 is >> bases; 00242 gps.append(bases); 00243 return is; 00244 } 00245 ostream& operator<<(ostream& os, const gnSequence& gps){ //write to source. 00246 os << gps.ToString(); 00247 return os; 00248 } 00249 string gnSequence::ToString( const gnSeqI len, const gnSeqI offset ) const 00250 { 00251 STACK_TRACE_START 00252 string str; 00253 ToString(str, len, offset); 00254 return str; 00255 STACK_TRACE_END 00256 } 00257 boolean gnSequence::ToString( string& str, const gnSeqI len, const gnSeqI offset ) const 00258 { 00259 STACK_TRACE_START 00260 gnSeqI real_offset = offset - 1; 00261 gnSeqI m_length = length(); 00262 gnSeqI readSize = len > m_length - real_offset ? m_length - real_offset : len; 00263 Array<char> array_buf( readSize+1 ); 00264 char *buf = array_buf.data; 00265 boolean success = spec->SeqRead( real_offset, buf, readSize, ALL_CONTIGS); 00266 buf[readSize] = '\0'; 00267 str = buf; 00268 00269 //now filter the string 00270 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin(); 00271 for(; iter != filter_list.end(); iter++) 00272 (*iter)->Filter(str); 00273 00274 if( success ) 00275 return true; 00276 return false; 00277 STACK_TRACE_END 00278 } 00279 boolean gnSequence::ToArray( gnSeqC* pSeqC, gnSeqI length, const gnSeqI offset ) const 00280 { 00281 STACK_TRACE_START 00282 gnSeqI real_offset = offset - 1; 00283 if(offset == GNSEQI_END) 00284 return false; 00285 gnSeqC* tmp = new gnSeqC[length]; 00286 boolean success = spec->SeqRead( real_offset, tmp, length, ALL_CONTIGS); 00287 00288 //now filter the array 00289 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin(); 00290 gnSeqC** tomp = &tmp; 00291 for(; iter != filter_list.end(); iter++) 00292 (*iter)->Filter(tomp, length); 00293 memcpy(pSeqC, *tomp, length); 00294 delete[] *tomp; 00295 00296 if( success ) 00297 return true; 00298 00299 return false; 00300 STACK_TRACE_END 00301 } 00302 gnSeqC gnSequence::GetSeqC( const gnSeqI offset ) const 00303 { 00304 STACK_TRACE_START 00305 char block; 00306 gnSeqI readLen = 1; 00307 gnSeqI real_offset = offset - 1; 00308 boolean success = spec->SeqRead( real_offset, &block, readLen, ALL_CONTIGS); 00309 00310 //now filter the char 00311 list<const gnBaseFilter*>::const_iterator iter = filter_list.begin(); 00312 for(; iter != filter_list.end(); iter++) 00313 block = (*iter)->Filter(block); 00314 00315 if( success ) 00316 return block; 00317 00318 return GNSEQC_NULL; 00319 STACK_TRACE_END 00320 } 00321 gnSeqC gnSequence::operator[]( const gnSeqI offset ) const 00322 { 00323 STACK_TRACE_START 00324 return GetSeqC( offset ); 00325 STACK_TRACE_END 00326 } 00327 00328 gnSeqI gnSequence::contigListSize() const{ 00329 STACK_TRACE_START 00330 return spec->GetSpecListLength(); 00331 STACK_TRACE_END 00332 } 00333 gnSeqI gnSequence::contigListLength() const{ 00334 STACK_TRACE_START 00335 return spec->GetSpecListLength(); 00336 STACK_TRACE_END 00337 } 00338 00339 //find the contig associated with base 00340 uint32 gnSequence::contigIndexByBase( const gnSeqI baseI) const{ 00341 STACK_TRACE_START 00342 return spec->GetSpecIndexByBase(baseI-1); 00343 STACK_TRACE_END 00344 } 00345 gnSequence gnSequence::contig( const uint32 contigI) const{ 00346 STACK_TRACE_START 00347 if(contigI == ALL_CONTIGS) 00348 return *this; 00349 return gnSequence(*spec->GetSpec(contigI)); 00350 STACK_TRACE_END 00351 } 00352 //returns a gnSequence pointer containing the specified contig. 00353 gnSequence gnSequence::contigByBase( const gnSeqI baseI) const{ 00354 STACK_TRACE_START 00355 return gnSequence(*spec->GetSpecByBase(baseI-1)); 00356 STACK_TRACE_END 00357 } 00358 uint32 gnSequence::contigIndexByName( string& contigName) const{ 00359 STACK_TRACE_START 00360 return spec->GetSpecIndexByName(contigName); 00361 STACK_TRACE_END 00362 } 00363 00364 gnSeqI gnSequence::contigStart( const uint32 contigI) const{ 00365 STACK_TRACE_START 00366 int64 leafI = contigI; 00367 // add one for the stupid geneticists whose indices start at 1 00368 return spec->GetSpecStartBase( leafI ) + 1; 00369 STACK_TRACE_END 00370 } 00371 00372 gnSeqI gnSequence::contigLength( const uint32 contigI) const{ 00373 STACK_TRACE_START 00374 return spec->GetSpec( contigI )->GetLength(); 00375 STACK_TRACE_END 00376 } 00377 00378 string gnSequence::contigName( const uint32 contigI) const{ 00379 STACK_TRACE_START 00380 return spec->GetSpec(contigI)->GetName(); 00381 STACK_TRACE_END 00382 } 00383 00384 void gnSequence::globalToLocal(uint32& contigI, gnSeqI& baseI) const{ 00385 STACK_TRACE_START 00386 contigI = contigIndexByBase(baseI); 00387 baseI -= (contigStart(contigI) - 1); 00388 STACK_TRACE_END 00389 } 00390 00391 void gnSequence::localToGlobal(const uint32 contigI, gnSeqI& baseI) const{ 00392 STACK_TRACE_START 00393 if(baseI > contigLength(contigI)) 00394 Throw_gnEx(SeqIndexOutOfBounds()); 00395 baseI += contigStart(contigI) - 1; 00396 STACK_TRACE_END 00397 } 00398 00399 void gnSequence::globalToSource(uint32& contigI, gnSeqI& baseI) const{ 00400 STACK_TRACE_START 00401 baseI--; //convert from geneticist coordinates 00402 gnSeqI seq_contig = spec->GetSpecIndexByBase(baseI); 00403 gnSeqI new_base = baseI - spec->GetSpecStartBase(seq_contig); 00404 gnFragmentSpec *fragment_spec = spec->GetSpec(seq_contig); 00405 seq_contig = fragment_spec->GetSpecIndexByBase(new_base); 00406 new_base = new_base - fragment_spec->GetSpecStartBase(seq_contig); 00407 gnContigSpec* contig_spec = fragment_spec->GetSpec(seq_contig); 00408 contigI = contig_spec->GetSourceContigIndex(); 00409 gnSeqI contig_start = contig_spec->GetStart(); 00410 if(contig_spec->IsReverseComplement()){ 00411 gnSeqI source_len = contig_spec->GetSourceLength(); 00412 //it may seem counter intuitive, but we can add the source_len and mod by source_len 00413 //to deal with circularity 00414 // contig_start = (contig_start - contig_spec->GetLength() + source_len) % source_len; 00415 //if we are reverse complement then the contig_start will be the ending base pair 00416 //we want to subtract new_base from it. and we can add/mod by source_len to deal 00417 //with circles. 00418 baseI = (contig_start - new_base - 1 + source_len) % source_len; 00419 }else 00420 baseI = contig_start + new_base + 1; 00421 STACK_TRACE_END 00422 } 00423 00424 void gnSequence::localToSource(uint32& contigI, gnSeqI& baseI) const{ 00425 STACK_TRACE_START 00426 localToGlobal(contigI, baseI); 00427 globalToSource(contigI, baseI); 00428 STACK_TRACE_END 00429 } 00430 00431 gnSequence gnSequence::contigByName( string& contigName) const{ 00432 STACK_TRACE_START 00433 uint32 contigIndex = spec->GetSpecIndexByName(contigName); 00434 return gnSequence(*spec->GetSpec(contigIndex)); 00435 STACK_TRACE_END 00436 } 00437 00438 void gnSequence::setContigName( const uint32 contigI, const string& contig_name) { 00439 STACK_TRACE_START 00440 if(contigI == ALL_CONTIGS) 00441 spec->SetName(contig_name); 00442 else 00443 spec->GetSpec(contigI)->SetName(contig_name); 00444 STACK_TRACE_END 00445 } 00446 00447 void gnSequence::setReverseComplement( const boolean revComp, const uint32 contigI){ 00448 STACK_TRACE_START 00449 if(contigI == ALL_CONTIGS) 00450 spec->SetReverseComplement(revComp); 00451 else{ 00452 gnFragmentSpec* contig = spec->GetSpec(contigI); 00453 contig->SetReverseComplement(revComp); 00454 } 00455 STACK_TRACE_END 00456 } 00457 00458 boolean gnSequence::isReverseComplement( const uint32 contigI ){ 00459 STACK_TRACE_START 00460 if(contigI == ALL_CONTIGS) 00461 return spec->IsReverseComplement(); 00462 gnFragmentSpec* contig = spec->GetSpec(contigI); 00463 return contig->IsReverseComplement(); 00464 STACK_TRACE_END 00465 } 00466 00467 uint32 gnSequence::getHeaderListLength(const uint32 contigI) const{ 00468 STACK_TRACE_START 00469 if(contigI == ALL_CONTIGS) 00470 return spec->GetHeaderListLength(); 00471 else{ 00472 return spec->GetSpec(contigI)->GetHeaderListLength(); 00473 } 00474 STACK_TRACE_END 00475 } 00476 00477 gnBaseHeader* gnSequence::getHeader(const uint32 contigI, const uint32 headerI) const{ 00478 STACK_TRACE_START 00479 if(contigI == ALL_CONTIGS) 00480 return spec->GetHeader(headerI); 00481 else{ 00482 return spec->GetSpec(contigI)->GetHeader(headerI); 00483 } 00484 STACK_TRACE_END 00485 } 00486 00487 void gnSequence::addHeader(const uint32 contigI, gnBaseHeader* header, const uint32 headerI){ 00488 STACK_TRACE_START 00489 if(contigI == ALL_CONTIGS) 00490 spec->AddHeader(header, headerI); 00491 else{ 00492 spec->GetSpec(contigI)->AddHeader(header, headerI); 00493 } 00494 STACK_TRACE_END 00495 } 00496 00497 void gnSequence::removeHeader(const uint32 contigI, const uint32 headerI){ 00498 STACK_TRACE_START 00499 if(contigI == ALL_CONTIGS) 00500 spec->RemoveHeader(headerI); 00501 else 00502 spec->GetSpec(contigI)->RemoveHeader(headerI); 00503 STACK_TRACE_END 00504 } 00505 00506 void gnSequence::merge(const gnSeqI startBase, const gnSeqI endBase){ 00507 STACK_TRACE_START 00508 STACK_TRACE_END 00509 } 00510 00511 void gnSequence::mergeContigs(const uint32 startC, const uint32 endC){ 00512 STACK_TRACE_START 00513 spec->MergeFragments(startC, endC); 00514 STACK_TRACE_END 00515 } 00516 00517 bool gnSequence::LoadSource(const string sourcename){ 00518 STACK_TRACE_START 00519 gnSourceFactory* m_sourceFactory = gnSourceFactory::GetSourceFactory(); 00520 gnBaseSource *m_pSource = m_sourceFactory->AddSource(sourcename, true); 00521 if (m_pSource!=NULL) 00522 { 00523 if(spec != NULL) 00524 delete spec; 00525 spec = m_pSource->GetSpec(); 00526 return true; 00527 } 00528 return false; 00529 STACK_TRACE_END 00530 } 00531 00532 gnSeqI gnSequence::find(const gnSequence& search, const gnSeqI offset)const{ 00533 STACK_TRACE_START 00534 //this is really adHoc... should probably be switched 00535 string searchIn=ToString(); 00536 string find= search.ToString(); 00537 string::size_type pos = searchIn.find(find, offset); 00538 if (pos == string::npos) 00539 return GNSEQI_ERROR; 00540 else{ 00541 return pos++; 00542 } 00543 STACK_TRACE_END 00544 }

Generated on Mon Feb 14 19:28:20 2005 for libGenome by doxygen 1.3.8