Hundreds of records is not much by today's standards, so you can read each file into a dictionary and work from there. Below is a simple, deliberately minimal example that associates the two files by key; I did not have time to take it further today. You can either leave unneeded records out of the dictionary when building it, or keep everything as is and filter just before writing to the third file.
## Stand-ins for the two input files, shaped like the lists readlines()
## would return: element 0 is the one-token file ID, every later element
## is one record.
file_1 = [
    "SU",
    "PD-98059 PD-98059 Tanimoto from SU = 0.129213",
    "BML-265 BML-265 Tanimoto from SU = 0.163743",
    "BML-257 BML-257 Tanimoto from SU = 0.156627",
    "SU 4312 SU 4312 Tanimoto from SU = 1",
    "AG-370 AG-370 Tanimoto from SU = 0.264286",
    "AG-490 AG-490 Tanimoto from SU = 0.347826",
    "PD-98060 PD-98059 Tanimoto from SU = 0.129213",
    "BML-265 BML-265 Tanimoto from SU = 0.163743",
    "BML-257 BML-257 Tanimoto from SU = 0.156627",
    "SU 4312 SU 4312 Tanimoto from SU = 1",
    "AG-370 AG-370 Tanimoto from SU = 0.264286",
    "AG-490 AG-490 Tanimoto from SU = 0.347826",
    "PD-98061 PD-98060 Tanimoto from SU = 0.129213",
    "BML-265 BML-265 Tanimoto from SU = 0.163743",
    "BML-257 BML-257 Tanimoto from SU = 0.156627",
    "SU 4312 SU 4312 Tanimoto from SU = 1",
    "AG-370 AG-370 Tanimoto from SU = 0.264286",
    "AG-490 AG-490 Tanimoto from SU = 0.347826",
]

file_2 = [
    "GF",
    "PD-98059 PD-98059 Tanimoto from GF = 0.118483",
    "BML-265 BML-265 Tanimoto from GF = 0.164179",
    "BML-257 BML-257 Tanimoto from GF = 0.213904",
    "SU 4312 SU 4312 Tanimoto from GF = 0.436364",
    "AG-370 AG-370 Tanimoto from GF = 0.284848",
    "AG-490 AG-490 Tanimoto from GF = 0.307692",
    "PD-98061 PD-98059 Tanimoto from GF = 0.118483",
    "BML-265 BML-265 Tanimoto from GF = 0.164179",
    "BML-257 BML-257 Tanimoto from GF = 0.213904",
    "SU 4312 SU 4312 Tanimoto from GF = 0.436364",
    "AG-370 AG-370 Tanimoto from GF = 0.284848",
    "AG-490 AG-490 Tanimoto from GF = 0.307692",
]
def groups(list_in):
    """Split a list of records into groups and collect them in a dictionary.

    A new group starts at every record that begins with "PD" and runs up
    to (not including) the next such record.  Each finished group is
    handed to to_dict(), which stores it in the result dictionary under
    the first token of the group's first record.

    list_in -- iterable of record strings; leading/trailing whitespace
               (e.g. the newline left by readlines()) is stripped, and
               records that are blank after stripping are skipped

    Returns a dictionary mapping each group key to its list of record
    strings.  Empty input yields an empty dictionary.
    """
    return_dict = {}
    group_list = []
    for rec in list_in:
        rec = rec.strip()
        if not rec:
            ## a blank line carries no data, and as the first record of a
            ## group it would leave to_dict() with no token to use as key
            continue
        if rec.startswith("PD") and group_list:     ## new group
            to_dict(group_list, return_dict)
            group_list = []
        group_list.append(rec)

    ## process the final group; there is none when the input was empty
    if group_list:
        to_dict(group_list, return_dict)
    return return_dict
def to_dict(group_list, dict_in):
    """Add one group of records to dict_in and return dict_in.

    key    = first whitespace-separated token of the group's first
             record (the "PD" + number identifier)
    value  = list of every record string in the group, the first
             ("PD") record included

    group_list -- list of record strings that make up one group
    dict_in    -- dictionary to update in place

    An empty group is ignored.  When the key is already present, the
    group is reported on stdout as a duplicate and is not stored.

    Returns dict_in, the same object that was passed in.
    """
    if not group_list:
        ## nothing to key on; group_list[0] would raise IndexError
        return dict_in

    ## the first record contains the "PD" key
    key = group_list[0].split()[0]
    if key in dict_in:
        ## NOTE(review): the source's indentation was lost; the append
        ## loop is taken to belong to the else branch, so a duplicate
        ## group is reported and skipped rather than merged -- confirm.
        ## The single-argument print(...) gives the same output under
        ## Python 2 and Python 3.
        print("DUPLICATE record %s" % group_list[0])
    else:
        ## store a copy of all the records, including the "PD" record
        dict_in[key] = list(group_list)
    return dict_in
## The first line of each file is its ID ("SU" and "GF" in the sample
## data); the remaining lines are the records to be grouped.
ID = file_1[0].strip()
file_1_dict = groups(file_1[1:])
ID += " " + file_2[0].strip()
file_2_dict = groups(file_2[1:])

## Each print(...) below takes a single pre-formatted string, so the
## output is the same under Python 2 and Python 3.
print("ID = %s" % ID)

## Keys come out in dictionary iteration order, not sorted.
for key in file_1_dict:
    print(key)
    for rec in file_1_dict[key]:
        print("  %s" % rec)
    ## Records from file_2 that share this key get an additional indent.
    ## NOTE(review): both record prints had the same " " literal in the
    ## source, presumably collapsed whitespace; the deeper indent here
    ## follows the original "additional indent" comment -- confirm width.
    for rec in file_2_dict.get(key, []):
        print("      %s" % rec)