00001
#include "gn/gnSequence.h"
00002
#include "gn/gnGBKSource.h"
00003
#include "gn/gnFASSource.h"
00004
#include <algorithm>
00005
00006
static boolean LocationLessthan(
const gnLocation& a,
const gnLocation& b);
00007
00008 static boolean LocationLessthan(
const gnLocation& a,
const gnLocation& b){
00009
return a.
GetStart() < b.
GetStart();
00010 }
00011
00012
static boolean LocationEndLessthan(
const gnLocation& a,
const gnLocation& b);
00013
00014 static boolean LocationEndLessthan(
const gnLocation& a,
const gnLocation& b){
00015
return a.
GetEnd() < b.
GetStart();
00016 }
00017
00018
static boolean LocationSizeLessthan(
const gnLocation& a,
const gnLocation& b);
00019
00020 static boolean LocationSizeLessthan(
const gnLocation& a,
const gnLocation& b){
00021
return a.
GetEnd() - a.
GetStart() < b.
GetEnd() - b.
GetStart();
00022 }
00023
00024
00025
00026 void ComplementLocation(vector<gnLocation>& l_list){
00027
if(l_list.size() < 2)
00028
return;
00029
00030 vector<gnLocation> comp_list;
00031
00032 sort(l_list.begin(), l_list.end(), &
LocationLessthan);
00033
00034
for(
uint32 locationI = 1; locationI < l_list.size(); locationI++){
00035
00036
00037
if( !l_list[locationI].Intersects( l_list[locationI-1] ) ){
00038
00039
00040
gnLocation between_loc;
00041 between_loc.
SetStart(l_list[locationI-1].GetEnd());
00042 between_loc.
SetEnd(l_list[locationI].GetStart());
00043
00044 comp_list.push_back(between_loc);
00045 }
00046
00047 }
00048
00049 l_list = comp_list;
00050 }
00051
00052 void DeleteRepeats(list<gnLocation>& loc_list){
00053
00054 loc_list.sort(&
LocationLessthan);
00055
00056 list<gnLocation>::iterator loc_iter = loc_list.begin();
00057 list<gnLocation>::iterator cur_loc;
00058
for(; loc_iter != loc_list.end(); loc_iter++){
00059 cur_loc = loc_iter;
00060 cur_loc++;
00061
while(cur_loc != loc_list.end() && !
LocationEndLessthan(*loc_iter, *cur_loc)){
00062
if(loc_iter->Intersects(*cur_loc)){
00063 *loc_iter = loc_iter->GetUnion(*cur_loc);
00064 list<gnLocation>::iterator to_del = cur_loc;
00065 cur_loc--;
00066 loc_list.erase(to_del);
00067 }
00068 cur_loc++;
00069 }
00070 }
00071 }
00072
00073 void WriteData(
gnSequence& file_sequence, list<gnLocation>& loc_list, string& filename){
00074
gnSequence loc_seq;
00075 cout <<
"Gathering sequence data to write...\n";
00076 list<gnLocation>::iterator loc_iter = loc_list.begin();
00077
uint32 loc_count = loc_list.size();
00078
uint32 notice_interval = loc_count / 10;
00079
uint32 cur_locI = 0;
00080
for(;loc_iter != loc_list.end(); loc_iter++){
00081 loc_seq += file_sequence.
ToString( loc_iter->GetEnd() - loc_iter->GetStart(), loc_iter->GetStart());
00082
if(cur_locI % notice_interval == 0){
00083 cout << (cur_locI / loc_count) * 10 <<
"%..";
00084 }
00085 cur_locI++;
00086 }
00087 cout <<
"\nWriting sequence data...\n";
00088
gnFASSource::Write(loc_seq, filename);
00089 }
00090
00091 void WriteList(list<gnLocation>& loc_list, string& filename){
00092 ofstream list_out(filename.c_str());
00093 list<gnLocation>::iterator loc_iter = loc_list.begin();
00094
for(;loc_iter != loc_list.end(); loc_iter++){
00095 list_out << loc_iter->GetStart() <<
'\t' << loc_iter->GetEnd() <<
'\n';
00096 }
00097 list_out.close();
00098 }
00099
00100 void print_usage(
char* pname){
00101 cout <<
"Usage: " << pname <<
" <genbank file> <exon_list> <intron_list> <exon_seq> <intron_seq>\n";
00102 }
00103
00104 int main(
int argc,
char* argv[] )
00105 {
00106
boolean run_interactive =
false;
00107 string seq_filename;
00108 string exon_list_filename;
00109 string intron_list_filename;
00110 string exon_seq_filename;
00111 string intron_seq_filename;
00112
00113
gnSequence file_sequence;
00114
00115
if(!run_interactive){
00116
00117
if(argc != 6){
00118
print_usage(argv[0]);
00119
return -1;
00120 }
00121
00122 seq_filename = argv[1];
00123 exon_list_filename = argv[2];
00124 intron_list_filename = argv[3];
00125 exon_seq_filename = argv[4];
00126 intron_seq_filename = argv[5];
00127 }
else{
00128
00129
00130 cout <<
"Enter the name of the sequence file to load:\n" ;
00131 cin >> seq_filename;
00132 }
00133
00134
00135
if(file_sequence.
LoadSource(seq_filename))
00136 {
00137 cout << seq_filename <<
" loaded successfully. " << file_sequence.
length() <<
" base pairs.\n";
00138 }
else{
00139 cout <<
"Error loading file.\n";
00140
return -1;
00141 }
00142
00143
uint32 feature_count = file_sequence.
getFeatureListLength();
00144
uint32 cds_count = 0;
00145
00146
00147
gnSequence exon_seq;
00148
gnSequence intron_seq;
00149
00150 list<gnLocation> exon_list;
00151 list<gnLocation> intron_list;
00152
00153 cout <<
"There are " << feature_count <<
" known features.\n";
00154
for(
uint32 featureI = 0; featureI < feature_count; featureI++){
00155
gnBaseFeature* cur_feat = file_sequence.
getFeature(featureI);
00156
if(cur_feat == NULL){
00157 cout <<
"cur_feat is NULL, trace me!\n";
00158 cur_feat = file_sequence.
getFeature(featureI);
00159
continue;
00160 }
00161
if(cur_feat->
GetName() ==
"CDS"){
00162 cds_count++;
00163
uint32 loc_count = cur_feat->
GetLocationListLength();
00164 vector<gnLocation> cur_exon_list, cur_intron_list;
00165
00166
for(
uint32 locationI = 0; locationI < loc_count; locationI++){
00167 cur_exon_list.push_back(cur_feat->
GetLocation(locationI));
00168 exon_list.push_back(cur_exon_list[locationI]);
00169 }
00170
if(loc_count >= 2){
00171 cur_intron_list = cur_exon_list;
00172
ComplementLocation(cur_intron_list);
00173
for(
uint32 locationI = 0; locationI < cur_intron_list.size(); locationI++)
00174 intron_list.push_back(cur_intron_list[locationI]);
00175 }
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
delete cur_feat;
00190 }
00191 }
00192
00193 cout <<
"There are " << cds_count <<
" cds features.\n";
00194
00195
00196
uint32 exon_count = exon_list.size();
00197 cout <<
"Deleting exon repeats... ";
00198
DeleteRepeats(exon_list);
00199 cout <<
"Eliminated " << exon_count - exon_list.size() <<
" repeats for a total of ";
00200 cout << exon_list.size() <<
" repeats\n";
00201
uint32 intron_count = intron_list.size();
00202 cout <<
"Deleting intron repeats... ";
00203
DeleteRepeats(intron_list);
00204 cout <<
"Eliminated " << intron_count - intron_list.size() <<
" repeats for a total of ";
00205 cout << intron_list.size() <<
" repeats\n";
00206
WriteList(exon_list, exon_list_filename);
00207
WriteList(intron_list, intron_list_filename);
00208
WriteData(file_sequence, exon_list, exon_seq_filename);
00209
WriteData(file_sequence, intron_list, intron_seq_filename);
00210
if(run_interactive)
00211 cin >> seq_filename;
00212
return 0;
00213 };