00001
#include "gn/gnSequence.h"
00002
#include <iostream>
00003
#include <fstream>
00004
#include <algorithm>
00005
00006 struct ExMem{
00007 gnSeqI length;
00008 int64 ex_start;
00009 int64 in_start;
00010
00011 };
00012
00013
static boolean LocationLessthan(
const gnLocation& a,
const gnLocation& b);
00014
00015 static boolean LocationLessthan(
const gnLocation& a,
const gnLocation& b){
00016
return a.
GetStart() < b.
GetStart();
00017 }
00018
00019
static boolean LocationEndLessthan(
const gnLocation& a,
const gnLocation& b);
00020
00021 static boolean LocationEndLessthan(
const gnLocation& a,
const gnLocation& b){
00022
return a.
GetEnd() < b.
GetStart();
00023 }
00024
00025
static boolean LocationSizeLessthan(
const gnLocation& a,
const gnLocation& b);
00026
00027 static boolean LocationSizeLessthan(
const gnLocation& a,
const gnLocation& b){
00028
return a.
GetEnd() - a.
GetStart() < b.
GetEnd() - b.
GetStart();
00029 }
00030
00031 static boolean ExMemLessthan(
const ExMem& a,
const ExMem& b){
00032
if(a.
ex_start == b.
ex_start){
00033
if(a.
in_start == b.
in_start){
00034
return a.
length < b.
length;
00035 }
00036
return a.
in_start < b.
in_start;
00037 }
00038
return a.
ex_start < b.
ex_start;
00039 }
00040
00041 void print_usage(
char* pname){
00042 cout <<
"Usage: " << pname <<
" <genbank file> <exon_list> <intron_list> <mem_list> <regulation_network> <minimum match size>\n";
00043 }
00044
00045 void load_map_file(string& filename, vector<gnLocation>& loc_list){
00046 ifstream coord_file;
00047 coord_file.open(filename.c_str());
00048
if(!coord_file.is_open()){
00049 cout <<
"couldn't open file\n";
00050
return;
00051 }
00052
00053
00054
while(coord_file.good()){
00055
gnSeqI start_coord, end_coord;
00056 coord_file >> start_coord;
00057 coord_file >> end_coord;
00058
00059
gnLocation new_location(start_coord, end_coord);
00060 loc_list.push_back(new_location);
00061 }
00062 }
00063
00064 void map_coordinates(vector<gnLocation>& loc_list, vector<gnLocation>& map_list){
00065
gnSeqI curStart = 1;
00066
for(
uint32 locationI = 0; locationI < loc_list.size(); locationI++){
00067
gnSeqI cur_len = loc_list[locationI].GetEnd() - loc_list[locationI].GetStart() + 1;
00068
gnLocation map_location(curStart, curStart+cur_len-1);
00069 map_list.push_back(map_location);
00070 curStart += cur_len;
00071 }
00072 }
00073
00074 void map_list_coordinates(list<gnLocation>& loc_list, list<gnLocation>& map_list){
00075
gnSeqI curStart = 1;
00076 list<gnLocation>::iterator loc_iter = loc_list.begin();
00077
for(; loc_iter != loc_list.end(); loc_iter++){
00078
gnSeqI cur_len = loc_iter->GetEnd() - loc_iter->GetStart() + 1;
00079
gnLocation map_location(curStart, curStart+cur_len-1);
00080 map_list.push_back(map_location);
00081 curStart += cur_len;
00082 }
00083 }
00084
00085 void print_feature(ostream& os,
gnBaseFeature* cur_feat){
00086 os << cur_feat->
GetName() <<
": \n";
00087
for(
uint32 i=0; i < cur_feat->
GetQualifierListLength(); i++)
00088 os << cur_feat->
GetQualifierName(i) <<
"\t" << cur_feat->
GetQualifierValue(i) <<
"\n";
00089 os <<
"\n";
00090
00091 }
00092
00093 uint32 loc_binary_search(vector<gnLocation>& loc_list,
uint32 startI,
uint32 endI,
gnLocation& query_loc){
00094
uint32 midI = ((endI - startI) / 2) + startI;
00095
if(startI == endI)
00096
return endI;
00097
00098
if(loc_list[midI].Intersects(query_loc)){
00099
return midI;
00100 }
else if(loc_list[midI].GetStart() < query_loc.
GetStart())
00101
return loc_binary_search(loc_list, midI + 1, endI, query_loc);
00102
else
00103
return loc_binary_search(loc_list, startI, midI, query_loc);
00104 }
00105
00106 int main(
int argc,
char* argv[]){
00107
00108
boolean run_interactive =
false;
00109 string seq_filename;
00110 string exon_list_filename;
00111 string intron_list_filename;
00112 string mem_list_filename;
00113 string reg_net_filename;
00114 vector<gnLocation> exon_list;
00115 vector<gnLocation> intron_list;
00116 vector<gnLocation> exon_map_list;
00117 vector<gnLocation> intron_map_list;
00118 vector<ExMem> mem_list;
00119
uint32 minimum_match_size;
00120
00121
00122
if(argc != 7){
00123
print_usage(argv[0]);
00124
return -1;
00125 }
00126
00127 seq_filename = argv[1];
00128 exon_list_filename = argv[2];
00129 intron_list_filename = argv[3];
00130 mem_list_filename = argv[4];
00131 reg_net_filename = argv[5];
00132 minimum_match_size = atoi(argv[6]);
00133
00134 ifstream mem_file(mem_list_filename.c_str());
00135
if(!mem_file.is_open()){
00136 cout <<
"Error opening " << mem_list_filename <<
"\n";
00137
return -1;
00138 }
00139
00140
if(run_interactive){
00141 cout <<
"Give the name of the exon list to search\n";
00142 cin >> exon_list_filename;
00143 cout <<
"Give the name of the intron list to search\n";
00144 cin >> intron_list_filename;
00145 cout <<
"Give the name of the regulatory network to output\n";
00146 cin >> reg_net_filename;
00147
00148 }
00149 ofstream net_file(reg_net_filename.c_str());
00150
00151
if(!net_file.is_open()){
00152 cout <<
"Error opening regulatory network file: " << reg_net_filename <<
"\n";
00153
return -2;
00154 }
00155
00156
load_map_file(exon_list_filename, exon_list);
00157
load_map_file(intron_list_filename, intron_list);
00158
00159 cout << exon_list.size() <<
" unique exons loaded from file\n";
00160 cout << intron_list.size() <<
" unique introns loaded from file\n";
00161
00162
00163
gnSequence seq_file;
00164
if(run_interactive){
00165 cout <<
"Enter the name of the genbank sequence file you are using\n";
00166 cin >> seq_filename;
00167 }
00168
if(!seq_file.
LoadSource(seq_filename)){
00169 cout <<
"Error loading file\n";
00170
return -1;
00171 }
00172 cout <<
"Sequence loaded successfully, " << seq_file.
length() <<
" base pairs.\n";
00173
00174
00175
map_coordinates(exon_list, exon_map_list);
00176
map_coordinates(intron_list, intron_map_list);
00177
00178
00179
while(mem_file.good()){
00180
ExMem m;
00181 mem_file >> m.
length;
00182 mem_file >> m.
ex_start;
00183 mem_file >> m.
in_start;
00184
if(m.
length >= minimum_match_size)
00185 mem_list.push_back(m);
00186 }
00187 cout << mem_list.size() <<
" matches loaded.\n";
00188
00189 sort(mem_list.begin(), mem_list.end(), &
ExMemLessthan);
00190
00191
00192
uint32 exonmapI = 0;
00193
uint32 notify_percent = 10;
00194
uint32 notify_interval = mem_list.size() / notify_percent;
00195
uint32 percent_count = 0;
00196 cout <<
"Searching for complementary matches:\n";
00197
for(
uint32 memI = 0; memI < mem_list.size(); memI++){
00198
if(memI % notify_interval == 0){
00199 cout << percent_count <<
"%.. ";
00200 percent_count += notify_percent;
00201 }
00202
00203
gnLocation ex_map_loc(mem_list[memI].ex_start, mem_list[memI].ex_start + mem_list[memI].length - 1);
00204
for(; exonmapI < exon_map_list.size(); exonmapI++){
00205
if(exon_map_list[exonmapI].Intersects(ex_map_loc))
00206
break;
00207 }
00208
00209
00210
uint32 mapEnd = exonmapI;
00211
for(; mapEnd < exon_map_list.size(); mapEnd++){
00212
if(!exon_map_list[exonmapI].Intersects(ex_map_loc))
00213
break;
00214 }
00215 mapEnd--;
00216
00217
uint32 intronmapI = 0;
00218
00219
int64 cur_in_start = mem_list[memI].in_start;
00220
if(cur_in_start < 0)
00221 cur_in_start = -cur_in_start;
00222
gnLocation in_map_loc(cur_in_start, cur_in_start + mem_list[memI].length - 1);
00223
uint32 search_mapI =
loc_binary_search(intron_map_list, 0, intron_map_list.size()-1, in_map_loc);
00224 intronmapI = search_mapI;
00225
00226
00227
for(; intronmapI >= 0; intronmapI--){
00228
if(!intron_map_list[intronmapI].Intersects(in_map_loc)){
00229 intronmapI++;
00230
break;
00231 }
00232
if(intronmapI == 0)
00233
break;
00234 }
00235
00236
uint32 intron_mapEnd = search_mapI;
00237
for(; intron_mapEnd < intron_map_list.size(); intron_mapEnd++){
00238
if(!intron_map_list[intronmapI].Intersects(in_map_loc))
00239
break;
00240 }
00241 intron_mapEnd--;
00242
00243
00244 vector<uint32> ex_feat_index, in_feat_index;
00245 vector<gnBaseFeature*> ex_feat_list;
00246 vector<gnBaseFeature*> in_feat_list;
00247
gnSeqI cur_match_len = mem_list[memI].length;
00248
00249
00250
00251
gnSeqI extra_exon_start = mem_list[memI].ex_start - exon_map_list[exonmapI].GetStart();
00252
gnSeqI cur_exon_len = exon_map_list[exonmapI].GetEnd() - exon_map_list[exonmapI].GetStart() + 1;
00253
gnSeqI max_exon_chunk = cur_exon_len - extra_exon_start;
00254
gnSeqI cur_exon_chunk = max_exon_chunk < cur_match_len ? max_exon_chunk : cur_match_len;
00255
00256
00257
gnSeqI extra_intron_start, cur_intron_len, max_intron_chunk, cur_intron_chunk;
00258
boolean complement =
false;
00259
if(mem_list[memI].in_start > 0){
00260 extra_intron_start = mem_list[memI].in_start - intron_map_list[intronmapI].GetStart();
00261 cur_intron_len = intron_map_list[intronmapI].GetEnd() - intron_map_list[intronmapI].GetStart() + 1;
00262 max_intron_chunk = cur_intron_len - extra_intron_start;
00263 cur_intron_chunk = max_intron_chunk < mem_list[memI].length ? max_intron_chunk : mem_list[memI].length;
00264 }
else{
00265
00266
if(cur_in_start >= intron_map_list[intron_mapEnd].GetStart())
00267 cur_intron_chunk = cur_match_len;
00268
else
00269 cur_intron_chunk = cur_in_start + cur_match_len - intron_map_list[intron_mapEnd].GetStart();
00270 complement =
true;
00271 seq_file.
getIntersectingFeatures(intron_list[intronmapI], in_feat_list, in_feat_index);
00272 }
00273
00274
00275
gnSeqI cur_chunk = cur_intron_chunk < cur_exon_chunk ? cur_intron_chunk : cur_exon_chunk;
00276
00277
gnLocation cur_exon_loc(exon_list[exonmapI].GetStart() + extra_exon_start, exon_list[exonmapI].GetStart() + cur_chunk);
00278 seq_file.
getIntersectingFeatures(cur_exon_loc, ex_feat_list, ex_feat_index);
00279
if(mem_list[memI].in_start > 0){
00280
gnLocation cur_intron_loc(intron_list[intronmapI].GetStart() - 1, intron_list[intronmapI].GetEnd() + 1);
00281 seq_file.
getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00282 }
else{
00283
gnLocation cur_intron_loc(intron_list[intron_mapEnd].GetStart() - 1, intron_list[intron_mapEnd].GetEnd() + 1);
00284 seq_file.
getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00285 }
00286 vector<gnBaseFeature*> ex_forward;
00287 vector<gnBaseFeature*> ex_reverse;
00288 vector<gnBaseFeature*> in_forward;
00289 vector<gnBaseFeature*> in_reverse;
00290
00291
for(
uint32 featI = 0; featI < ex_feat_list.size(); featI++){
00292 string featName = ex_feat_list[featI]->GetName();
00293
if(featName ==
"mRNA" || featName ==
"CDS" || featName ==
"gene" )
00294
if(ex_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00295 ex_reverse.push_back(ex_feat_list[featI]);
00296
else
00297 ex_forward.push_back(ex_feat_list[featI]);
00298 }
00299
for(
uint32 featI = 0; featI < in_feat_list.size(); featI++){
00300 string featName = in_feat_list[featI]->GetName();
00301
if(featName ==
"mRNA" || featName ==
"CDS" || featName ==
"gene" )
00302
if(in_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00303 in_reverse.push_back(in_feat_list[featI]);
00304
else
00305 in_forward.push_back(in_feat_list[featI]);
00306 }
00307
00308
if(complement){
00309 vector<gnBaseFeature*> tmp_in = in_forward;
00310 in_forward = in_reverse;
00311 in_reverse = tmp_in;
00312 }
00313
00314
00315
if((ex_forward.size() > 0 && in_reverse.size() > 0) || (in_forward.size() > 0 && ex_reverse.size() > 0)){
00316 net_file <<
"================================\n";
00317 net_file <<
"Mem: " << mem_list[memI].length <<
"\n";
00318 net_file <<
"This exon/intron matching size: " << cur_chunk <<
"\n";
00319 }
00320
if(ex_forward.size() > 0 && in_reverse.size() > 0){
00321 net_file <<
"Forward Exons:\n";
00322
for(
uint32 featI = 0; featI < ex_forward.size(); featI++)
00323
print_feature(net_file, ex_forward[featI]);
00324 net_file <<
"Matching introns:\n";
00325
for(
uint32 featI = 0; featI < in_reverse.size(); featI++)
00326
print_feature(net_file, in_reverse[featI]);
00327 }
00328
if(in_forward.size() > 0 && ex_reverse.size() > 0){
00329 net_file <<
"Reverse Exons:\n";
00330
for(
uint32 featI = 0; featI < ex_reverse.size(); featI++)
00331
print_feature(net_file, ex_reverse[featI]);
00332 net_file <<
"Matching introns:\n";
00333
for(
uint32 featI = 0; featI < in_forward.size(); featI++)
00334
print_feature(net_file, in_forward[featI]);
00335 }
00336
00337
00338
for(
uint32 featI = 0; featI < ex_feat_list.size(); featI++)
00339
delete ex_feat_list[featI];
00340
for(
uint32 featI = 0; featI < in_feat_list.size(); featI++)
00341
delete in_feat_list[featI];
00342
00343
00344
00345
00346
00347 }
00348 }
00349