Hi everyone,
I would like to parse this text file

PREVIT:nmrValidate_166w.pdb
 atoms "A   -750 -GLU -CG  " and "B   -54  -GLN -CG  "            3.8759 A apart
 atoms "A   -750 -GLU -CD  " and "B   -54  -GLN -CG  "            3.8447 A apart
 atoms "A   -750 -GLU -C   " and "B   -51  -LYS -CE  "            3.8431 A apart
 atoms "A   -751 -PRO -CB  " and "B   -52  -ARG -CB  "            3.7116 A apart
 atoms "A   -751 -PRO -CB  " and "B   -52  -ARG -CD  "            3.3998 A apart
 atoms "A   -751 -PRO -CG  " and "B   -52  -ARG -CB  "            3.4295 A apart
 atoms "A   -751 -PRO -C   " and "B   -51  -LYS -CE  "            3.5360 A apart
 atoms "A   -752 -VAL -CA  " and "B   -51  -LYS -CB  "            3.8923 A apart
 atoms "A   -753 -LYS -CG  " and "B   -52  -ARG -CG  "            3.3236 A apart
 atoms "A   -753 -LYS -CG  " and "B   -52  -ARG -CD  "            3.8331 A apart
 atoms "A   -758 -PRO -CD  " and "B   -48  -GLY -CA  "            3.6687 A apart
 atoms "A   -809 -TYR -CE2 " and "B   -49  -ARG -CG  "            3.8926 A apart
 atoms "A   -809 -TYR -CE2 " and "B   -49  -ARG -CD  "            3.7719 A apart
 atoms "A   -809 -TYR -CE2 " and "B   -49  -ARG -CZ  "            3.7362 A apart
PREVIT:nmrValidate_192w.pdb
 atoms "A   -748 -PHE -CA  " and "B   -50  -ALY -CD  "            3.4090 A apart
 atoms "A   -748 -PHE -CA  " and "B   -50  -ALY -CE  "            3.8186 A apart
 atoms "A   -748 -PHE -CD1 " and "B   -50  -ALY -CG  "            3.6835 A apart
 atoms "A   -748 -PHE -C   " and "B   -50  -ALY -CD  "            3.8035 A apart
 atoms "A   -751 -PRO -CD  " and "B   -52  -ARG -CB  "            3.8566 A apart
 atoms "A   -809 -TYR -CZ  " and "B   -48  -GLY -CA  "            3.5344 A apart
PREVIT:nmrValidate_251w.pdb
 atoms "A   -748 -PHE -CA  " and "B   -50  -ALY -CZ  "            3.8672 A apart
 atoms "A   -748 -PHE -CD1 " and "B   -50  -ALY -CZ  "            3.6054 A apart
 atoms "A   -748 -PHE -CD1 " and "B   -50  -ALY -CM  "            3.8393 A apart
 atoms "A   -750 -GLU -CB  " and "B   -51  -LYS -CA  "            3.7115 A apart

I want to extract every group that starts with a line beginning with PREVIT, then count the lines in each group that contain "-50 -ALY", and choose the group that has the most "-50 -ALY" lines.

This is my code, but it doesn't work for my purpose. Could you please correct it?

#!/usr/bin/perl -w
use strict;
use FileHandle;

# extractALY($fileHandle)
#
# Reads contact records from an already-open filehandle. A line beginning
# with "PREVIT" starts a new group; every following line that mentions
# residue "-50 -ALY" (with any amount of whitespace between the two fields)
# is counted for that group.
#
# Prints the group with the most "-50 -ALY" lines and returns the list
# (group header line, count). When several groups tie, the one that appears
# first in the input wins. Returns an empty list, and prints nothing, when
# the input contains no PREVIT header.
#
# No prototype is declared: a prototype changes how calls are parsed and
# does not validate arguments.
sub extractALY
{
        my ($fileHandle) = @_;

        my %count_for;        # group header line => number of "-50 -ALY" lines
        my @groups_in_order;  # headers in input order, used to break ties
        my $current_group;    # header of the group being read; undef before the first

        while (my $line = <$fileHandle>)
        {
                chomp($line);

                # Match the literal word. A character class such as [PREVIT]
                # would accept any line starting with P, R, E, V, I or T.
                if ($line =~ m/^PREVIT/)
                {
                        $current_group = $line;
                        if (!exists $count_for{$current_group})
                        {
                                $count_for{$current_group} = 0;
                                push(@groups_in_order, $current_group);
                        }
                        next;
                }

                # Ignore anything that precedes the first group header.
                next if !defined $current_group;

                # \s+ tolerates the column padding in the data ("-50  -ALY").
                $count_for{$current_group}++ if $line =~ m/-50\s+-ALY/;
        }

        return if !@groups_in_order;

        my $best_group = $groups_in_order[0];
        for my $group (@groups_in_order)
        {
                # Strictly greater, so the earliest group keeps a tie.
                $best_group = $group
                        if $count_for{$group} > $count_for{$best_group};
        }

        print "$best_group has the most '-50 -ALY' lines ($count_for{$best_group})\n";
        return ($best_group, $count_for{$best_group});
}

# Open the data file, hand it to extractALY(), then close it.
# FileHandle->new is a direct method call; "new FileHandle" relies on the
# ambiguous indirect-object syntax.
my $fh = FileHandle->new;
# Pass the mode separately from the name, and report why the open failed.
$fh->open("test.txt", "<") or die "Could not open file test.txt: $!\n";
extractALY($fh);
$fh->close();

Many thanks

Edited 3 Years Ago by mike_2000_17: Fixed formatting

#!/usr/bin/perl
use strict;
use warnings;

# Count the contact lines in each PREVIT group and list the groups from the
# largest count to the smallest.

my %group;       # group header line => number of contact lines in that group
my $groupname;   # header of the group currently being read

#Change the following line to assign path to your data file
my $path = '/home/david/Programming/Perl/data';
my $filename = "$path/test.txt";
open(my $fh, '<', $filename) or die "Cannot open $filename: $!\n";

while (my $line = <$fh>){
    chomp $line;
    if ($line =~ m/^PREVIT/){
        # A header line starts a new group with an empty count.
        $groupname = $line;
        $group{$groupname} = 0;
        next;
    }
    # Blank lines, and anything before the first header, are not contacts.
    next if !defined $groupname || $line !~ m/\S/;
    $group{$groupname}++;
}
close $fh;

#The following code adapted from code found at
# http://devdaily.com/perl/edu/qanda/plqa00016/
print "\nGROUP COUNTS IN DESCENDING NUMERIC ORDER:\n";
foreach my $key (sort hashValueDescendingNum (keys(%group))) {
   print "\t$group{$key} \t\t $key\n";
}

#----------------------------------------------------------------------#
#  FUNCTION:  hashValueDescendingNum                                   #
#                                                                      #
#  PURPOSE:   Sort comparator for the keys of %group. Orders keys by   #
#             their hash 'value', highest count first. Keys with equal #
#             counts are ordered by name so the listing is the same    #
#             on every run (hash key order is not stable).             #
#                                                                      #
#  USAGE:     sort hashValueDescendingNum (keys(%group))               #
#             $a and $b are the two keys supplied by sort.             #
#----------------------------------------------------------------------#

sub hashValueDescendingNum {
   return $group{$b} <=> $group{$a}
       || $a cmp $b;
}

This gives the following output:

GROUP COUNTS IN DESCENDING NUMERIC ORDER:
	14 		 PREVIT:nmrValidate_166w.pdb
	6 		 PREVIT:nmrValidate_192w.pdb
	4 		 PREVIT:nmrValidate_251w.pdb

I forgot that you wanted to count only the lines containing "-50 -ALY". That is easily done by adding a condition to the statement that increments the value of the hash item. The following prints only one group that has the most "-50 -ALY" records (if more than one group has the 'most' it prints only one of them.)

#!/usr/bin/perl
use strict;
use warnings;

# Find the PREVIT group that contains the most "-50 -ALY" contact lines.

my %group;       # group header line => number of "-50 -ALY" lines in that group
my $groupname;   # header of the group currently being read

#Change the following line to assign path to your data file
my $path = '/home/david/Programming/Perl/data';
my $filename = "$path/test.txt";
open(my $fh, '<', $filename) or die "Cannot open $filename: $!\n";

while (my $line = <$fh>){
    chomp $line;
    if ($line =~ m/^PREVIT/){
        # A header line starts a new group with an empty count.
        $groupname = $line;
        $group{$groupname} = 0;
        next;
    }
    # Nothing to attribute a line to until the first header has been seen.
    next if !defined $groupname;
    # \s+ rather than a single space: the raw data pads its columns
    # ("-50  -ALY"), which a one-space pattern would never match.
    $group{$groupname}++ if $line =~ m/-50\s+-ALY/;
}
close $fh;

# Walk the groups in name order so that, when counts tie, the same group
# (the alphabetically first) is reported on every run.
my @biggest = ('None', 0);
foreach my $key (sort keys %group) {
    if ($biggest[1] < $group{$key}){
        @biggest = ($key, $group{$key});
    }
}
print "The $biggest[0] group has the most '-50 -ALY' lines ($biggest[1]).\n";
#### Output is
# The PREVIT:nmrValidate_192w.pdb group has the most '-50 -ALY' lines (4).

Dear David Ellis
Thank you so much for your solution ^^.
By the way, I want to get the 5 groups with the most "-50 -ALY" lines, so I changed the condition to:
$biggest[5] < $group{$key}
but it does not work. How can I fix it?

Best regards
Quy

Edited 6 Years Ago by becon: n/a

Dear David Ellis
Thank you so much for your solution ^^.
By the way, I want to get the 5 groups with the most "-50 -ALY" lines, so I changed the condition to:
$biggest[5] < $group{$key}
but it does not work. How can I fix it?

Best regards
Quy

Sorry, Quy, I'm not sure I understand your latest question. There were only three groups in the data you posted so there is no "5 biggest group". Could you show me what output you want the program to print?
Best regards,
David

Dear David
Oh, I am sorry for not making that clear. My original text file has 1000 PREVIT groups; I extracted 3 of them to test the code first. I tried to print every PREVIT group into a separate file and push the count of '-50 -ALY' lines onto an array, then choose the 5 groups that have the most '-50 -ALY' lines, but I did not succeed. You can just copy the 3 groups to make 6 or however many you need...
The output I want looks like this:
GROUP COUNTS IN DESCENDING NUMERIC ORDER:
Number of contact number of ALY group
14 0 PREVIT:nmrValidate_166w.pdb
6 4 PREVIT:nmrValidate_192w.pdb
4 3 PREVIT:nmrValidate_251w.pdb

OK, I duplicated the data and added group names PREVIT:nmrValidate_300w.pdb etc. (See attached input data file test.txt)

#!/usr/bin/perl
use strict;
use warnings;

# Report the five PREVIT groups with the most "-50 -ALY" contact lines,
# together with each group's total number of contact lines.

my %group;       # group header line => { count_all => N, count_50_ALY => N }
my $groupname;   # header of the group currently being read

#Change the following line to assign path to your data file
my $path = '/home/david/Programming/Perl/data';
my $filename = "$path/test.txt";
open(my $fh, '<', $filename) or die "Cannot open $filename: $!\n";

while (my $line = <$fh>){
    chomp $line;
    if ($line =~ m/^PREVIT/){
        # A header line starts a new group with both counters at zero.
        $groupname = $line;
        $group{$groupname}->{count_all} = 0;
        $group{$groupname}->{count_50_ALY} = 0;
        next;
    }
    # Blank lines, and anything before the first header, are not contacts.
    next if !defined $groupname || $line !~ m/\S/;
    $group{$groupname}->{count_all}++;
    # \s+ rather than a single space: the raw data pads its columns
    # ("-50  -ALY"), which a one-space pattern would never match.
    $group{$groupname}->{count_50_ALY}++ if $line =~ m/-50\s+-ALY/;
}
close $fh;

#The following code adapted from code found at
# http://devdaily.com/perl/edu/qanda/plqa00016/
print "\nTHE FIVE GROUPS HAVING MOST -50 -ALY\n";
print "GROUP COUNTS IN DESCENDING NUMERIC ORDER:\n";
print "Number of contact\t\tnumber of ALY group\n";
my $line_count = 1;
# Highest "-50 -ALY" count first; equal counts fall back to the group name so
# the ranking is identical on every run (hash key order is not stable).
my @ranked = sort {
    $group{$b}->{count_50_ALY} <=> $group{$a}->{count_50_ALY}
        || $a cmp $b
} (keys(%group));
foreach my $key (@ranked) {
    last  if $line_count > 5; #Print only the top five groups.
    print "($line_count)\t$group{$key}->{count_all}\t\t\t$group{$key}->{count_50_ALY} \t\t $key\n";
    $line_count++;
}
#Output is:
#THE FIVE GROUPS HAVING MOST -50 -ALY
#GROUP COUNTS IN DESCENDING NUMERIC ORDER:
#Number of contact		number of ALY group
#(1)	6			4 		 PREVIT:nmrValidate_192w.pdb
#(2)	6			4 		 PREVIT:nmrValidate_301w.pdb
#(3)	4			3 		 PREVIT:nmrValidate_251w.pdb
#(4)	4			3 		 PREVIT:nmrValidate_302w.pdb
#(5)	14			0 		 PREVIT:nmrValidate_166w.pdb
Attachments
PREVIT:nmrValidate_166w.pdb
atoms "A -750 -GLU -CG " and "B -54 -GLN -CG " 3.8759 A apart
atoms "A -750 -GLU -CD " and "B -54 -GLN -CG " 3.8447 A apart
atoms "A -750 -GLU -C " and "B -51 -LYS -CE " 3.8431 A apart
atoms "A -751 -PRO -CB " and "B -52 -ARG -CB " 3.7116 A apart
atoms "A -751 -PRO -CB " and "B -52 -ARG -CD " 3.3998 A apart
atoms "A -751 -PRO -CG " and "B -52 -ARG -CB " 3.4295 A apart
atoms "A -751 -PRO -C " and "B -51 -LYS -CE " 3.5360 A apart
atoms "A -752 -VAL -CA " and "B -51 -LYS -CB " 3.8923 A apart
atoms "A -753 -LYS -CG " and "B -52 -ARG -CG " 3.3236 A apart
atoms "A -753 -LYS -CG " and "B -52 -ARG -CD " 3.8331 A apart
atoms "A -758 -PRO -CD " and "B -48 -GLY -CA " 3.6687 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CG " 3.8926 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CD " 3.7719 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CZ " 3.7362 A apart
PREVIT:nmrValidate_192w.pdb
atoms "A -748 -PHE -CA " and "B -50 -ALY -CD " 3.4090 A apart
atoms "A -748 -PHE -CA " and "B -50 -ALY -CE " 3.8186 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CG " 3.6835 A apart
atoms "A -748 -PHE -C " and "B -50 -ALY -CD " 3.8035 A apart
atoms "A -751 -PRO -CD " and "B -52 -ARG -CB " 3.8566 A apart
atoms "A -809 -TYR -CZ " and "B -48 -GLY -CA " 3.5344 A apart
PREVIT:nmrValidate_251w.pdb
atoms "A -748 -PHE -CA " and "B -50 -ALY -CZ " 3.8672 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CZ " 3.6054 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CM " 3.8393 A apart
atoms "A -750 -GLU -CB " and "B -51 -LYS -CA " 3.7115 A apart
PREVIT:nmrValidate_300w.pdb
atoms "A -750 -GLU -CG " and "B -54 -GLN -CG " 3.8759 A apart
atoms "A -750 -GLU -CD " and "B -54 -GLN -CG " 3.8447 A apart
atoms "A -750 -GLU -C " and "B -51 -LYS -CE " 3.8431 A apart
atoms "A -751 -PRO -CB " and "B -52 -ARG -CB " 3.7116 A apart
atoms "A -751 -PRO -CB " and "B -52 -ARG -CD " 3.3998 A apart
atoms "A -751 -PRO -CG " and "B -52 -ARG -CB " 3.4295 A apart
atoms "A -751 -PRO -C " and "B -51 -LYS -CE " 3.5360 A apart
atoms "A -752 -VAL -CA " and "B -51 -LYS -CB " 3.8923 A apart
atoms "A -753 -LYS -CG " and "B -52 -ARG -CG " 3.3236 A apart
atoms "A -753 -LYS -CG " and "B -52 -ARG -CD " 3.8331 A apart
atoms "A -758 -PRO -CD " and "B -48 -GLY -CA " 3.6687 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CG " 3.8926 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CD " 3.7719 A apart
atoms "A -809 -TYR -CE2 " and "B -49 -ARG -CZ " 3.7362 A apart
PREVIT:nmrValidate_301w.pdb
atoms "A -748 -PHE -CA " and "B -50 -ALY -CD " 3.4090 A apart
atoms "A -748 -PHE -CA " and "B -50 -ALY -CE " 3.8186 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CG " 3.6835 A apart
atoms "A -748 -PHE -C " and "B -50 -ALY -CD " 3.8035 A apart
atoms "A -751 -PRO -CD " and "B -52 -ARG -CB " 3.8566 A apart
atoms "A -809 -TYR -CZ " and "B -48 -GLY -CA " 3.5344 A apart
PREVIT:nmrValidate_302w.pdb
atoms "A -748 -PHE -CA " and "B -50 -ALY -CZ " 3.8672 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CZ " 3.6054 A apart
atoms "A -748 -PHE -CD1 " and "B -50 -ALY -CM " 3.8393 A apart
atoms "A -750 -GLU -CB " and "B -51 -LYS -CA " 3.7115 A apart

Dear David
The problem is solved. Thank you very much.
Best regards
becon

Edited 6 Years Ago by becon: n/a

This article has been dead for over six months. Start a new discussion instead.