Hi all... I have been trying to read a huge file of size around 7 GB.

It looks like an impossible task now.

The file looks like this:

CITY0_084989053<tab>hdewhiuewf
CITY1_000989090<tab>AACGTACGT
CITY1_000989090<tab>GTACGATAH
CITY2_643274032<tab>kdijadadsail
CITY3_004498906<tab>Adjbsajdada
CITY3_004498906<tab>Adjbsajdada
......

I expect output like the following, containing only the records whose left-side ID is duplicated:

CITY1_000989090#1<tab>AACGTACGT
CITY1_000989090#2<tab>GTACGATAH
CITY3_004498906#1<tab>Adjbsajdada
CITY3_004498906#2<tab>Adjbsajdada
......

The steps I follow are:
1) First I use awk to extract the left-side IDs into a file, then run uniq on them to create a file of unique headers.
2) Read this file, compare it against the duplicate headers, and assign #1 / #2...

Here is my initial code

#include<iostream>
#include<string.h>
#include<fstream>
#include<stdlib.h>
#include<set>
#include<map>
using namespace std;


// Parses the leading integer in strConvert with atoi(); text that does not
// start with a number yields 0, and trailing non-digits are ignored.
inline int GetIntVal(string strConvert)
{
	return atoi(strConvert.c_str());
}


// Prints every record of the feature file (argv[2]) whose first column is
// listed in the ID file (argv[1]).
//
//   argv[1]  ID file: each whole line is treated as one ID.
//   argv[2]  feature file: tab-separated, column 1 is the ID and column 2 the
//            value that is echoed.
//
// Each match is written to stdout as "<id>/#<n>\t<value>", where <n>
// alternates 1, 2, 1, 2, ... across successive matches.
//
// Returns 0 on success, 1 on a usage error or if a file cannot be opened.
// Limitation: lines longer than kLineMax-1 characters are split by fgets.
int main(int argc ,char* argv[])
{
	if (argc < 3) {
		cerr << "Usage: " << argv[0] << " <id_file> <feature_file>" << endl;
		return 1;
	}

	const int kLineMax = 3000;
	char line[kLineMax];

	// ---- Pass 1: load the ID list into an ordered set --------------------
	set<string> ids;

	FILE *idFile = fopen(argv[1], "r");
	if (idFile == NULL) {
		cerr << "Error: unable to open " << argv[1] << endl;
		return 1;
	}
	// Loop on fgets' own result: testing feof() before reading drops a final
	// line that has no trailing newline and reuses a stale buffer on errors.
	while (fgets(line, kLineMax, idFile) != NULL) {
		line[strcspn(line, "\n")] = '\0';	// store the ID without its newline
		if (line[0] != '\0') {
			ids.insert(line);
		}
	}
	fclose(idFile);

	// ---- Pass 2: stream the feature file and tag the matching rows -------
	FILE *featureFile = fopen(argv[2], "r");
	if (featureFile == NULL) {
		cerr << "Error: unable to open " << argv[2] << endl;
		return 1;
	}

	string chr;		// column 1: the record ID
	string start;	// column 2: the value echoed after the tag
	int count = 1;	// running tag number, wraps from 2 back to 1

	while (fgets(line, kLineMax, featureFile) != NULL) {
		// Reset per line so a blank or one-column line cannot reuse the
		// previous record's fields.
		chr.clear();
		start.clear();

		int column = 0;
		for (char *tok = strtok(line, "\t\n"); tok != NULL; tok = strtok(NULL, "\t\n")) {
			++column;
			if (column == 1) {
				chr.assign(tok);
			} else if (column == 2) {
				start.assign(tok);
			}
		}
		if (chr.empty()) {
			continue;
		}

		// O(log n) lookup instead of scanning the whole set for every line.
		if (ids.find(chr) != ids.end()) {
			cout << chr << "/" << "#" << count << "\t" << start << '\n';
			count++;
			if (count == 3) {
				count = 1;
			}
		}
	}	// End of feature-file reading loop
	fclose(featureFile);

	return 0;
}

I have never used the seek function. Maybe it can solve the problem, but I have no idea how to use it in this scenario.

Thanks in advance!

Edited 4 Years Ago by DNA_Monk: n/a

After you pull all of the unique left side ID's, how many are there? And are they all 15 characters? How much memory do you have on your machine?

Edited 4 Years Ago by histrungalot: n/a

After you pull all of the unique left side ID's, how many are there? And are they all 15 characters? How much memory do you have on your machine?

The file of unique left-side IDs is 1.15 GB, and each ID is 15-17 characters long. I have a total of 4 GB of RAM.
Maybe I can convert the file to a fixed-length format, because I read that we can use the seekg() function if the file is fixed length.

Thanks in advance

I found the solution myself... just a couple of shell one-liners:

# Pass 1: `!A[$1]++` is true only the first time a given first field is seen,
# so this writes the first record of every ID to 1.txt, tagged "#1".
# NOTE(review): this also matches IDs that occur only once in file.txt.
awk '!A[$1]++ { print $1"#1""\t"$2 }' file.txt > 1.txt
# Pass 2: `A[$1]++` is true from the second occurrence of an ID onward,
# so every repeated record goes to 2.txt, always tagged "#2".
# Fields are split on awk's default whitespace separator, so only the first
# whitespace-delimited word after the ID is kept as $2.
awk 'A[$1]++ { print $1"#2""\t"$2 }' file.txt > 2.txt

Then just merge the 2 files using a cat command :) and that's it!
So simple :P Shell programming is really cool ;)

Thanks for all your help and suggestions !

Edited 4 Years Ago by DNA_Monk: n/a

Should work.

#include <map>
#include <iostream>
#include <string>
#include <fstream>
#include <cstring>
#include <cerrno>
using namespace std;

/**
 * Copies to <outfile> every record of <infile> whose key (the text before the
 * first tab) occurs more than once, rewriting the key as "key#N", where N is
 * the 1-based occurrence number of that key in file order.
 *
 * The input is read twice: pass 1 counts how often each key occurs, pass 2
 * rewinds the stream and emits the records of the duplicated keys. Only the
 * keys are held in memory, never the record payloads.
 *
 * @param argc must be 3
 * @param argv argv[1] = input path, argv[2] = output path
 * @return 0 on success; -1 on a usage error, if a file cannot be opened, if a
 *         line has no tab, if memory runs out, or if writing fails
 */
int main(int argc, char **argv){
   if ( argc != 3 ) {
       cout << "Usage: " << argv[0] << " <infile> <outfile>" << endl;
       return -1;
   }
   ifstream inStrm(argv[1]);
   if ( !inStrm ) {
       cout << "Error: Unable to open file " << argv[1] << " -> " << strerror(errno) << endl;
       return -1;
   }
   unsigned long long lineNum(0);   // 1-based number of the line being read
   map<string,int>    uniqMap;      // key -> occurrence count
   string             line;

   // Pass 1: count the occurrences of every key.
   try {
      while (getline(inStrm,line)){
          ++lineNum;
          const size_t pos = line.find('\t');
          if ( pos == string::npos ){
             cout << "Error:  No '\\t' found at line: " << lineNum << "\n\t" << line << endl;
             return -1;
          }
          line.erase(pos);          // keep only the key, in place
          ++uniqMap[line];
      }
   } catch (const bad_alloc &) {
      // Needed too much memory!!
      cout << "Error: Out of memory at line: " << lineNum << endl;
      cout << "Going to have to use files!" << endl;
      return -1;
   }

   ofstream outStrm(argv[2]);
   if ( !outStrm ) {
      cout << "Error: Unable to open file " << argv[2] << " -> " << strerror(errno) << endl;
      return -1;
   }

   // Drop the keys seen only once; for the rest, reuse the counter as the
   // next occurrence number to hand out.
   for (map<string,int>::iterator it = uniqMap.begin(); it != uniqMap.end();) {
      if ( it->second == 1 ) {
         uniqMap.erase(it++);       // post-increment keeps the iterator valid
      } else {
         it->second = 1;
         ++it;
      }
   }

   // Pass 2: clear the EOF state, rewind, and emit the duplicated records.
   inStrm.clear();
   inStrm.seekg (0, ios::beg);
   while (getline(inStrm,line)){
      const size_t pos = line.find('\t');
      if ( pos == string::npos ) {
         continue;                  // pass 1 already rejected such lines
      }
      map<string,int>::iterator hit = uniqMap.find(line.substr(0,pos));
      if ( hit != uniqMap.end() ) {
         // substr(pos + 1) skips the original tab so only one is written;
         // '\n' instead of endl avoids a flush per record.
         outStrm << hit->first << '#' << hit->second++ << '\t'
                 << line.substr(pos + 1) << '\n';
      }
   }

   outStrm.flush();
   if ( !outStrm ) {
      cout << "Error: Failed while writing file " << argv[2] << endl;
      return -1;
   }
   return 0;
}
This article has been dead for over six months. Start a new discussion instead.