src/coordMapper.cpp

Go to the documentation of this file.
00001 #include "gn/gnSequence.h" 00002 #include <iostream> 00003 #include <fstream> 00004 #include <algorithm> 00005 00006 struct ExMem{ 00007 gnSeqI length; 00008 int64 ex_start; 00009 int64 in_start; 00010 00011 }; 00012 00013 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b); 00014 00015 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){ 00016 return a.GetStart() < b.GetStart(); 00017 } 00018 00019 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b); 00020 00021 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){ 00022 return a.GetEnd() < b.GetStart(); 00023 } 00024 00025 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b); 00026 00027 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){ 00028 return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart(); 00029 } 00030 00031 static boolean ExMemLessthan(const ExMem& a, const ExMem& b){ 00032 if(a.ex_start == b.ex_start){ 00033 if(a.in_start == b.in_start){ 00034 return a.length < b.length; 00035 } 00036 return a.in_start < b.in_start; 00037 } 00038 return a.ex_start < b.ex_start; 00039 } 00040 00041 void print_usage(char* pname){ 00042 cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <mem_list> <regulation_network> <minimum match size>\n"; 00043 } 00044 00045 void load_map_file(string& filename, vector<gnLocation>& loc_list){ 00046 ifstream coord_file; 00047 coord_file.open(filename.c_str()); 00048 if(!coord_file.is_open()){ 00049 cout << "couldn't open file\n"; 00050 return; 00051 } 00052 00053 //read all the exon coordinates 00054 while(coord_file.good()){ 00055 gnSeqI start_coord, end_coord; 00056 coord_file >> start_coord; 00057 coord_file >> end_coord; 00058 00059 gnLocation new_location(start_coord, end_coord); 00060 loc_list.push_back(new_location); 00061 } 00062 } 00063 00064 void map_coordinates(vector<gnLocation>& loc_list, vector<gnLocation>& map_list){ 00065 gnSeqI curStart = 1; 00066 for(uint32 locationI = 0; locationI < loc_list.size(); locationI++){ 00067 gnSeqI cur_len = loc_list[locationI].GetEnd() - loc_list[locationI].GetStart() + 1; 00068 gnLocation map_location(curStart, curStart+cur_len-1); 00069 map_list.push_back(map_location); 00070 curStart += cur_len; 00071 } 00072 } 00073 00074 void map_list_coordinates(list<gnLocation>& loc_list, list<gnLocation>& map_list){ 00075 gnSeqI curStart = 1; 00076 list<gnLocation>::iterator loc_iter = loc_list.begin(); 00077 for(; loc_iter != loc_list.end(); loc_iter++){ 00078 gnSeqI cur_len = loc_iter->GetEnd() - loc_iter->GetStart() + 1; 00079 gnLocation map_location(curStart, curStart+cur_len-1); 00080 map_list.push_back(map_location); 00081 curStart += cur_len; 00082 } 00083 } 00084 00085 void print_feature(ostream& os, gnBaseFeature* cur_feat){ 00086 os << cur_feat->GetName() << ": \n"; 00087 for(uint32 i=0; i < cur_feat->GetQualifierListLength(); i++) 00088 os << cur_feat->GetQualifierName(i) << "\t" << cur_feat->GetQualifierValue(i) << "\n"; 00089 os << "\n"; 00090 00091 } 00092 00093 uint32 loc_binary_search(vector<gnLocation>& loc_list, uint32 startI, uint32 endI, gnLocation& query_loc){ 00094 uint32 midI = ((endI - startI) / 2) + startI; 00095 if(startI == endI) 00096 return endI; 00097 00098 if(loc_list[midI].Intersects(query_loc)){ 00099 return midI; 00100 }else if(loc_list[midI].GetStart() < query_loc.GetStart()) 00101 return loc_binary_search(loc_list, midI + 1, endI, query_loc); 00102 else 00103 return loc_binary_search(loc_list, startI, midI, query_loc); 00104 } 00105 00106 int main(int argc, char* argv[]){ 00107 00108 boolean run_interactive = false; 00109 string seq_filename; 00110 string exon_list_filename; 00111 string intron_list_filename; 00112 string mem_list_filename; 00113 string reg_net_filename; 00114 vector<gnLocation> exon_list; 00115 vector<gnLocation> intron_list; 00116 vector<gnLocation> exon_map_list; 00117 vector<gnLocation> intron_map_list; 00118 vector<ExMem> mem_list; 00119 uint32 minimum_match_size; 00120 00121 // check for correct calling semantics 00122 if(argc != 7){ 00123 print_usage(argv[0]); 00124 return -1; 00125 } 00126 00127 seq_filename = argv[1]; 00128 exon_list_filename = argv[2]; 00129 intron_list_filename = argv[3]; 00130 mem_list_filename = argv[4]; 00131 reg_net_filename = argv[5]; 00132 minimum_match_size = atoi(argv[6]); 00133 00134 ifstream mem_file(mem_list_filename.c_str()); 00135 if(!mem_file.is_open()){ 00136 cout << "Error opening " << mem_list_filename << "\n"; 00137 return -1; 00138 } 00139 00140 if(run_interactive){ 00141 cout << "Give the name of the exon list to search\n"; 00142 cin >> exon_list_filename; 00143 cout << "Give the name of the intron list to search\n"; 00144 cin >> intron_list_filename; 00145 cout << "Give the name of the regulatory network to output\n"; 00146 cin >> reg_net_filename; 00147 00148 } 00149 ofstream net_file(reg_net_filename.c_str()); 00150 00151 if(!net_file.is_open()){ 00152 cout << "Error opening regulatory network file: " << reg_net_filename << "\n"; 00153 return -2; 00154 } 00155 00156 load_map_file(exon_list_filename, exon_list); 00157 load_map_file(intron_list_filename, intron_list); 00158 00159 cout << exon_list.size() << " unique exons loaded from file\n"; 00160 cout << intron_list.size() << " unique introns loaded from file\n"; 00161 00162 //now load the genbank file 00163 gnSequence seq_file; 00164 if(run_interactive){ 00165 cout << "Enter the name of the genbank sequence file you are using\n"; 00166 cin >> seq_filename; 00167 } 00168 if(!seq_file.LoadSource(seq_filename)){ 00169 cout << "Error loading file\n"; 00170 return -1; 00171 } 00172 cout << "Sequence loaded successfully, " << seq_file.length() << " base pairs.\n"; 00173 00174 //construct a mapping between coordinates... 00175 map_coordinates(exon_list, exon_map_list); 00176 map_coordinates(intron_list, intron_map_list); 00177 00178 //now read the mem file 00179 while(mem_file.good()){ 00180 ExMem m; 00181 mem_file >> m.length; 00182 mem_file >> m.ex_start; 00183 mem_file >> m.in_start; 00184 if(m.length >= minimum_match_size) 00185 mem_list.push_back(m); 00186 } 00187 cout << mem_list.size() << " matches loaded.\n"; 00188 //sort the mem list 00189 sort(mem_list.begin(), mem_list.end(), &ExMemLessthan); 00190 00191 //now get the intersection for each mem in the list... 00192 uint32 exonmapI = 0; 00193 uint32 notify_percent = 10; 00194 uint32 notify_interval = mem_list.size() / notify_percent; 00195 uint32 percent_count = 0; 00196 cout << "Searching for complementary matches:\n"; 00197 for(uint32 memI = 0; memI < mem_list.size(); memI++){ 00198 if(memI % notify_interval == 0){ 00199 cout << percent_count << "%.. "; 00200 percent_count += notify_percent; 00201 } 00202 //simple linear search for intersecting exon mapping 00203 gnLocation ex_map_loc(mem_list[memI].ex_start, mem_list[memI].ex_start + mem_list[memI].length - 1); 00204 for(; exonmapI < exon_map_list.size(); exonmapI++){ 00205 if(exon_map_list[exonmapI].Intersects(ex_map_loc)) 00206 break; 00207 } 00208 00209 //continue to search for any other mappings that intersect 00210 uint32 mapEnd = exonmapI; 00211 for(; mapEnd < exon_map_list.size(); mapEnd++){ 00212 if(!exon_map_list[exonmapI].Intersects(ex_map_loc)) 00213 break; 00214 } 00215 mapEnd--; 00216 00217 uint32 intronmapI = 0; // intronmapI will contain the index of the first intersecting intron 00218 //do a binary search for intersecting intron mappings 00219 int64 cur_in_start = mem_list[memI].in_start; 00220 if(cur_in_start < 0) 00221 cur_in_start = -cur_in_start; 00222 gnLocation in_map_loc(cur_in_start, cur_in_start + mem_list[memI].length - 1); 00223 uint32 search_mapI = loc_binary_search(intron_map_list, 0, intron_map_list.size()-1, in_map_loc); 00224 intronmapI = search_mapI; 00225 00226 //search backwards for previous intersections 00227 for(; intronmapI >= 0; intronmapI--){ 00228 if(!intron_map_list[intronmapI].Intersects(in_map_loc)){ 00229 intronmapI++; 00230 break; 00231 } 00232 if(intronmapI == 0) 00233 break; 00234 } 00235 //continue to search for any other mappings that intersect 00236 uint32 intron_mapEnd = search_mapI; 00237 for(; intron_mapEnd < intron_map_list.size(); intron_mapEnd++){ 00238 if(!intron_map_list[intronmapI].Intersects(in_map_loc)) 00239 break; 00240 } 00241 intron_mapEnd--; 00242 00243 //we have the mappings, now map the coordinates 00244 vector<uint32> ex_feat_index, in_feat_index; 00245 vector<gnBaseFeature*> ex_feat_list; 00246 vector<gnBaseFeature*> in_feat_list; 00247 gnSeqI cur_match_len = mem_list[memI].length; 00248 00249 //find out how much of the first exon was matched 00250 //extra exon start has the number of unmatched bases at the beginning of the exon 00251 gnSeqI extra_exon_start = mem_list[memI].ex_start - exon_map_list[exonmapI].GetStart(); 00252 gnSeqI cur_exon_len = exon_map_list[exonmapI].GetEnd() - exon_map_list[exonmapI].GetStart() + 1; 00253 gnSeqI max_exon_chunk = cur_exon_len - extra_exon_start; 00254 gnSeqI cur_exon_chunk = max_exon_chunk < cur_match_len ? max_exon_chunk : cur_match_len; 00255 00256 //find out how much of the first intron was matched 00257 gnSeqI extra_intron_start, cur_intron_len, max_intron_chunk, cur_intron_chunk; 00258 boolean complement = false; 00259 if(mem_list[memI].in_start > 0){ 00260 extra_intron_start = mem_list[memI].in_start - intron_map_list[intronmapI].GetStart(); 00261 cur_intron_len = intron_map_list[intronmapI].GetEnd() - intron_map_list[intronmapI].GetStart() + 1; 00262 max_intron_chunk = cur_intron_len - extra_intron_start; 00263 cur_intron_chunk = max_intron_chunk < mem_list[memI].length ? max_intron_chunk : mem_list[memI].length; 00264 }else{ 00265 //reverse complement, start at the end. 00266 if(cur_in_start >= intron_map_list[intron_mapEnd].GetStart()) 00267 cur_intron_chunk = cur_match_len; 00268 else 00269 cur_intron_chunk = cur_in_start + cur_match_len - intron_map_list[intron_mapEnd].GetStart(); 00270 complement = true; 00271 seq_file.getIntersectingFeatures(intron_list[intronmapI], in_feat_list, in_feat_index); 00272 } 00273 00274 //the current chunk will be the smaller of the two mappings 00275 gnSeqI cur_chunk = cur_intron_chunk < cur_exon_chunk ? cur_intron_chunk : cur_exon_chunk; 00276 00277 gnLocation cur_exon_loc(exon_list[exonmapI].GetStart() + extra_exon_start, exon_list[exonmapI].GetStart() + cur_chunk); 00278 seq_file.getIntersectingFeatures(cur_exon_loc, ex_feat_list, ex_feat_index); 00279 if(mem_list[memI].in_start > 0){ 00280 gnLocation cur_intron_loc(intron_list[intronmapI].GetStart() - 1, intron_list[intronmapI].GetEnd() + 1); 00281 seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index); 00282 }else{ 00283 gnLocation cur_intron_loc(intron_list[intron_mapEnd].GetStart() - 1, intron_list[intron_mapEnd].GetEnd() + 1); 00284 seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index); 00285 } 00286 vector<gnBaseFeature*> ex_forward; 00287 vector<gnBaseFeature*> ex_reverse; 00288 vector<gnBaseFeature*> in_forward; 00289 vector<gnBaseFeature*> in_reverse; 00290 00291 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++){ 00292 string featName = ex_feat_list[featI]->GetName(); 00293 if(featName == "mRNA" || featName == "CDS" || featName == "gene" ) 00294 if(ex_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement) 00295 ex_reverse.push_back(ex_feat_list[featI]); 00296 else 00297 ex_forward.push_back(ex_feat_list[featI]); 00298 } 00299 for(uint32 featI = 0; featI < in_feat_list.size(); featI++){ 00300 string featName = in_feat_list[featI]->GetName(); 00301 if(featName == "mRNA" || featName == "CDS" || featName == "gene" ) 00302 if(in_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement) 00303 in_reverse.push_back(in_feat_list[featI]); 00304 else 00305 in_forward.push_back(in_feat_list[featI]); 00306 } 00307 00308 if(complement){ 00309 vector<gnBaseFeature*> tmp_in = in_forward; 00310 in_forward = in_reverse; 00311 in_reverse = tmp_in; 00312 } 00313 00314 //now print out all the complementary features 00315 if((ex_forward.size() > 0 && in_reverse.size() > 0) || (in_forward.size() > 0 && ex_reverse.size() > 0)){ 00316 net_file << "================================\n"; 00317 net_file << "Mem: " << mem_list[memI].length << "\n"; 00318 net_file << "This exon/intron matching size: " << cur_chunk << "\n"; 00319 } 00320 if(ex_forward.size() > 0 && in_reverse.size() > 0){ 00321 net_file << "Forward Exons:\n"; 00322 for(uint32 featI = 0; featI < ex_forward.size(); featI++) 00323 print_feature(net_file, ex_forward[featI]); 00324 net_file << "Matching introns:\n"; 00325 for(uint32 featI = 0; featI < in_reverse.size(); featI++) 00326 print_feature(net_file, in_reverse[featI]); 00327 } 00328 if(in_forward.size() > 0 && ex_reverse.size() > 0){ 00329 net_file << "Reverse Exons:\n"; 00330 for(uint32 featI = 0; featI < ex_reverse.size(); featI++) 00331 print_feature(net_file, ex_reverse[featI]); 00332 net_file << "Matching introns:\n"; 00333 for(uint32 featI = 0; featI < in_forward.size(); featI++) 00334 print_feature(net_file, in_forward[featI]); 00335 } 00336 00337 //release memory 00338 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++) 00339 delete ex_feat_list[featI]; 00340 for(uint32 featI = 0; featI < in_feat_list.size(); featI++) 00341 delete in_feat_list[featI]; 00342 00343 //loop while there is stuff to match 00344 // while(cur_match_len > 0){ 00345 00346 // } 00347 } 00348 } 00349

Generated on Mon Feb 14 19:28:19 2005 for libGenome by doxygen 1.3.8