0

Dear all

I have two text files. file1 with 42000 rows and 6 columns and file2 with 18 rows and 1 column. I want to match the entries in file2 with file1 either in column0 or in column 3. if the entry exist in any column. Write that line to a file.

I have following code - but its printing just one match and not all the occurrences of that entry.

f1 = open('file1.txt')
f2 = open('file2.txt')
f3 = open('file_out.txt', 'w')

d3 = []
d4 = []

testdict = {}

d5 = []

for line in f1:    
    r1 = line.split()
    testdict[r1[0]] = r1[1:] 
        
for line in f2:   
    r2 = line.split()
    d3.append(r2[0])
    

for k,v in testdict.iteritems():
    if k in d3:
        print k, '\t', d3, '\n'
        f3.write("%s\t%s\n"%(k,v))
Attachments
212354_at (cc) 212344_at = 47.15128 	0.180159873659
212353_at (cc) 207981_s_at = 0.5296896 	0.0020238858679
212353_at (cc) 212344_at = 49.85142 	0.190476812698
212353_at (cc) 212354_at = 26.9219 	0.102865629559
227140_at (cc) 212344_at = 3.871491 	0.0147925427988
227140_at (cc) 212354_at = 6.024117 	0.0230174908446
227140_at (cc) 212353_at = 21.91211 	0.0837237709775
217428_s_at (cc) 212344_at = 1.580161 	0.00603762196912
217428_s_at (cc) 212354_at = 2.933379 	0.0112081196518
217428_s_at (cc) 212353_at = 6.177036 	0.0236017775856
217428_s_at (cc) 227140_at = 1.552292 	0.00593113757421
210511_s_at (cc) 222830_at = 0.504421 	0.00192733731642
210511_s_at (cc) 212344_at = 3.555406 	0.0135848166526
210511_s_at (cc) 212354_at = 5.439621 	0.0207841956819
210511_s_at (cc) 212353_at = 3.339755 	0.0127608378142
210511_s_at (cc) 227140_at = 28.52045 	0.108973513928
210511_s_at (cc) 217428_s_at = 3.695462e-08 	9.46818936835e-11
200887_s_at (cc) 222830_at = 6.718117 	0.0256691887911
200887_s_at (cc) 227140_at = 0.3809849 	0.00145570150533
200887_s_at (cc) 210511_s_at = 3.066229 	0.0117157249431
227566_at (cc) 212344_at = 0.4296921 	0.00164180637884
227566_at (cc) 212354_at = 1.261670 	0.00482070276065
227566_at (cc) 212353_at = 3.482865 	0.0133076454412
227566_at (cc) 227140_at = 6.628637 	0.025327295517
227566_at (cc) 217428_s_at = 6.186533 	0.0236380645819
227566_at (cc) 210511_s_at = 3.695462e-08 	9.46818936835e-11
227566_at (cc) 200887_s_at = 6.025413 	0.0230224427186
203298_s_at (cc) 222830_at = 1.549221 	0.00591940361975
203298_s_at (cc) 212344_at = 5.781745 	0.0220914139935
203298_s_at (cc) 212354_at = 23.47947 	0.089712481775
203298_s_at (cc) 212353_at = 2.907554 	0.0111094451569
203298_s_at (cc) 227140_at = 1.342155 	0.00512822712552
203298_s_at (cc) 210511_s_at = 1.106672 	0.00422847238844
203298_s_at (cc) 200887_s_at = 1.010281 	0.00386017294065
203298_s_at (cc) 227566_at = 1.169249 	0.0044675722478
205941_s_at (cc) 212344_at = 0.008097274 	3.09387500791e-05
205941_s_at (cc) 212354_at = 0.2545379 	0.000972561375296
205941_s_at (cc) 212353_at = 9.852396 	0.0376449253734
205941_s_at (cc) 227140_at = 10.20878 	0.0390066295823
205941_s_at (cc) 217428_s_at = 58.66162 	0.224139621413
205941_s_at (cc) 210511_s_at = 5.816012 	0.0222223444452
205941_s_at (cc) 200887_s_at = 1.273487 	0.00486585422266
205941_s_at (cc) 227566_at = 9.933372 	0.0379543257954
219773_at (cc) 212344_at = 0.008097274 	3.09387500791e-05
219773_at (cc) 227140_at = 3.244916 	0.0123984683883
219773_at (cc) 217428_s_at = 5.479653 	0.0209371535669
219773_at (cc) 210511_s_at = 2.860411 	0.0109293169202
219773_at (cc) 200887_s_at = 5.297075 	0.0202395430371
219773_at (cc) 205941_s_at = 2.861078 	0.0109318654541
212464_s_at (cc) 222830_at = 1.639203 	0.00626321498088
212464_s_at (cc) 212344_at = 9.792414 	0.0374157407246
212464_s_at (cc) 212354_at = 18.00545 	0.0687968512373
212464_s_at (cc) 212353_at = 3.120775 	0.0119241392316
212464_s_at (cc) 227140_at = 1.445088 	0.00552152283839
212464_s_at (cc) 210511_s_at = 2.924278 	0.0111733457282
212464_s_at (cc) 227566_at = 1.286096 	0.00491403182987
212464_s_at (cc) 203298_s_at = 93.39799 	0.356863484523
212464_s_at (cc) 219773_at = 0.1857639 	0.000709783458044
216442_x_at (cc) 222830_at = 3.499723 	0.0133720580118
216442_x_at (cc) 212344_at = 3.225008 	0.012322402102
216442_x_at (cc) 212354_at = 17.54906 	0.0670530350618
216442_x_at (cc) 212353_at = 2.985588 	0.0114076045194
216442_x_at (cc) 227140_at = 0.3550059 	0.00135643859329
216442_x_at (cc) 210511_s_at = 1.145902 	0.00437836591933
216442_x_at (cc) 200887_s_at = 1.053108 	0.00402381021437
216442_x_at (cc) 227566_at = 2.949963 	0.0112714852984
216442_x_at (cc) 203298_s_at = 188.2296 	0.719204674006
216442_x_at (cc) 212464_s_at = 95.32533 	0.364227639451
204619_s_at (cc) 212344_at = 6.30888 	0.0241055390612
204619_s_at (cc) 212354_at = 8.7307 	0.0333590478808
204619_s_at (cc) 212353_at = 3.258694 	0.0124511126165
204619_s_at (cc) 217428_s_at = 0.2897871 	0.00110724470564
204619_s_at (cc) 210511_s_at = 3.071035 	0.0117340881425
204619_s_at (cc) 200887_s_at = 1.224860 	0.00468005578455
204619_s_at (cc) 227566_at = 5.988842 	0.0228827089353
204619_s_at (cc) 203298_s_at = 1.039020 	0.00396998151023
204619_s_at (cc) 205941_s_at = 0.295532 	0.00112919533897
204619_s_at (cc) 212464_s_at = 13.04338 	0.0498373255465
204619_s_at (cc) 216442_x_at = 5.092813 	0.0194590803196
211719_x_at (cc) 222830_at = 9.770733 	0.0373328999996
211719_x_at (cc) 212344_at = 13.75548 	0.0525581816095
211719_x_at (cc) 212354_at = 39.86055 	0.152302793299
211719_x_at (cc) 212353_at = 8.689453 	0.033201447614
211719_x_at (cc) 227140_at = 5.597002 	0.0213855312359
211719_x_at (cc) 217428_s_at = 0.2174616 	0.000830896896083
211719_x_at (cc) 210511_s_at = 0.2370171 	0.000905616318551
211719_x_at (cc) 200887_s_at = 4.867567 	0.0185984400376
211719_x_at (cc) 227566_at = 5.317515 	0.0203176420371
211719_x_at (cc) 203298_s_at = 129.5986 	0.495182048211
211719_x_at (cc) 219773_at = 0.1130696 	0.000432026504106
211719_x_at (cc) 212464_s_at = 61.86106 	0.236364331035
211719_x_at (cc) 216442_x_at = 91.39104 	0.349195148509
211719_x_at (cc) 204619_s_at = 4.822223 	0.0184251855827
203083_at (cc) 212344_at = 0.3890002 	0.00148632708825
203083_at (cc) 212354_at = 1.074872 	0.00410696807331
203083_at (cc) 212353_at = 0.3217873 	0.00122951396303
203083_at (cc) 227140_at = 14.76158 	0.0564023794538
203083_at (cc) 217428_s_at = 0.2617804 	0.00100023417411
203083_at (cc) 210511_s_at = 13.63344 	0.0520918801435
203083_at (cc) 200887_s_at = 13.65134 	0.0521602740819
203083_at (cc) 227566_at = 0.2816289 	0.00107607311747
203083_at (cc) 203298_s_at = 0.9647186 	0.00368608400331
203083_at (cc) 205941_s_at = 2.987681 	0.011415601643
203083_at (cc) 212464_s_at = 2.740017 	0.0104693046398
203083_at (cc) 204619_s_at = 1.187559 	0.00453753275126
203083_at (cc) 211719_x_at = 0.9181697 	0.00350822575757
204620_s_at (cc) 222830_at = 0.4592894 	0.00175489441866
204620_s_at (cc) 212344_at = 1.425543 	0.00544684353527
204620_s_at (cc) 212354_at = 2.554699 	0.00976122487027
204620_s_at (cc) 212353_at = 0.3024386 	0.00115558470069
204620_s_at (cc) 227140_at = 0.3578312 	0.00136723375498
204620_s_at (cc) 210511_s_at = 0.2556141 	0.000976673417558
204620_s_at (cc) 200887_s_at = 8.701064 	0.0332458119726
204620_s_at (cc) 203298_s_at = 0.8622305 	0.00329448820459
204620_s_at (cc) 205941_s_at = 0.2445681 	0.000934467862058
204620_s_at (cc) 219773_at = 8.183067 	0.0312666022014
204620_s_at (cc) 212464_s_at = 0.1854906 	0.000708739208694
204620_s_at (cc) 216442_x_at = 0.1408216 	0.000538063854843
204620_s_at (cc) 204619_s_at = 25.16918 	0.0961686785145
204620_s_at (cc) 211719_x_at = 2.221988 	0.00848997260012
204620_s_at (cc) 203083_at = 5.095876 	0.0194707837069
226997_at (cc) 222830_at = 0.02406085 	9.19338245721e-05
226997_at (cc) 212344_at = 6.906919 	0.0263905805428
226997_at (cc) 212354_at = 14.43690 	0.0551618127546
226997_at (cc) 212353_at = 3.676880 	0.0140489554946
226997_at (cc) 227140_at = 1.742030 	0.00665610567936
226997_at (cc) 217428_s_at = 0.008775296 	3.35293978386e-05
226997_at (cc) 210511_s_at = 14.97636 	0.0572230302967
226997_at (cc) 200887_s_at = 0.3921534 	0.0014983751199
226997_at (cc) 227566_at = 1.612585 	0.00616151051987
226997_at (cc) 203298_s_at = 19.36977 	0.0740097684454
226997_at (cc) 205941_s_at = 0.4179076 	0.00159677909577
226997_at (cc) 219773_at = 3.247681 	0.0124090331503
226997_at (cc) 212464_s_at = 9.650254 	0.0368725629435
226997_at (cc) 216442_x_at = 14.0013 	0.0534974328907
226997_at (cc) 204619_s_at = 14.79516 	0.0565306849539
226997_at (cc) 211719_x_at = 19.13292 	0.0731047905511
226997_at (cc) 204620_s_at = 3.228894 	0.0123372500822
211571_s_at (cc) 212344_at = 9.414277 	0.0359709206866
211571_s_at (cc) 212354_at = 2.379277 	0.00909095663227
211571_s_at (cc) 212353_at = 1.206883 	0.0046113676376
211571_s_at (cc) 227140_at = 0.3457477 	0.00132106402567
211571_s_at (cc) 210511_s_at = 3.695462e-08 	9.46818936835e-11
211571_s_at (cc) 200887_s_at = 2.574950 	0.00983860172202
211571_s_at (cc) 227566_at = 0.2423534 	0.000926005736061
211571_s_at (cc) 203298_s_at = 0.0861718 	0.000329252957959
211571_s_at (cc) 205941_s_at = 0.2265213 	0.000865513016956
211571_s_at (cc) 216442_x_at = 0.1064381 	0.000406688269333
211571_s_at (cc) 204619_s_at = 1.018735 	0.0038924747482
211571_s_at (cc) 211719_x_at = 0.06466542 	0.000247079436802
211571_s_at (cc) 203083_at = 4.846682 	0.0185186407414
211571_s_at (cc) 204620_s_at = 31.39671 	0.119963388187
211571_s_at (cc) 226997_at = 3.091961 	0.0118140440953
225681_at (cc) 222830_at = 0.02240355 	8.56014628909e-05
225681_at (cc) 212344_at = 1.551266 	0.00592721734064
225681_at (cc) 212354_at = 2.884892 	0.0110228561384
225681_at (cc) 227140_at = 10.28565 	0.0393003414283
225681_at (cc) 217428_s_at = 3.166819 	0.0121000683098
225681_at (cc) 210511_s_at = 5.835335 	0.0222961755107
225681_at (cc) 200887_s_at = 1.240328 	0.00473915731746
225681_at (cc) 227566_at = 3.291316 	0.0125757577035
225681_at (cc) 203298_s_at = 8.502333 	0.0324864825999
225681_at (cc) 205941_s_at = 0.2896482 	0.00110671398396
225681_at (cc) 219773_at = 1.129602 	0.0043160854056
225681_at (cc) 212464_s_at = 1.185156 	0.0045283511516
225681_at (cc) 216442_x_at = 2.759737 	0.0105446525987
225681_at (cc) 204619_s_at = 1.263847 	0.00482902083908
225681_at (cc) 211719_x_at = 5.00887 	0.0191383433156
225681_at (cc) 203083_at = 13.91252 	0.0531582142399
225681_at (cc) 204620_s_at = 5.381358 	0.0205615791437
225681_at (cc) 226997_at = 10.24782 	0.039155797144
221731_x_at (cc) 212344_at = 0.4063952 	0.00155279147699
221731_x_at (cc) 212354_at = 1.139211 	0.00435280034158
221731_x_at (cc) 212353_at = 1.404098 	0.00536490454037
221731_x_at (cc) 227140_at = 0.006185611 	2.36344952498e-05
221731_x_at (cc) 210511_s_at = 3.695462e-08 	9.46818936835e-11
221731_x_at (cc) 200887_s_at = 1.222233 	0.00467001830544
221731_x_at (cc) 227566_at = 3.227831 	0.0123331884756
221731_x_at (cc) 203298_s_at = 2.596616 	0.0099213851337
221731_x_at (cc) 212464_s_at = 2.833334 	0.01082
207981_s_at	   
224833_at	   
201328_at	   
201329_s_at	   
209604_s_at	   
222830_at	   
230218_at	   
235521_at	   
228904_at	   
204779_s_at	   
206858_s_at	   
208436_s_at	   
206104_at	   
203298_s_at	   
205619_s_at	   
228964_at	   
205063_at	   
202935_s_at
2
Contributors
1
Reply
2
Views
6 Years
Discussion Span
Last Post by griswolf
0

I think you are working too hard. How about this code?

with open('file1.txt') as f1: # 42 K lines
  with open('file2.txt') as f2: # 18 lines
    with open('file3.txt','w') as fout:
      matches = set()
      for line in f2:
        matches.add(line.strip())
      for line in f1:
        c0,c1,c2,rest = line.split(None,3)
        if c0 in matches or c2 in matches:
          fout.write(line)

I get 48 lines matching.

This topic has been dead for over six months. Start a new discussion instead.
Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.