ajay_p5 0 Light Poster

Hi everyone,

I am using this code to parse data from latex files :

#!/Applications/xampp/xamppfiles/bin/perl
use strict;
use File::Copy;
use LaTex::Parser;
use Text::CSV_XS;
use utf8;
use Unicode::String;

Unicode::String->stringify_as( 'utf8' ); # utf8 already is the default

  
 #see Ch 7.1 of Data Munging with Perl

 my $output = "ID+add.txt";

 open (LAT_LON_OUT_FILE, "> $output") or die "Couldn't open $output for writing: $!\n";
 binmode LAT_LON_OUT_FILE, ":utf8";


	    	    
    
#This routine is supposed to work in the dir where the .tar.gz files are
#It extracts each .tar.gz file into a temp directory
my $tdir = 'temp';
mkdir $tdir;

my @flist=<*.tar.gz>;                                   
chdir $tdir;
my $not_processed = 0; #Number of files that the script cannot process
my $number_of_papers = 0;
my $file_cat = " ";

#Go through all the .tar.gz files in current dir
foreach my $item (@flist)
{
    unlink glob "*.*";      #erase all files in dir $tdir                                                                                         
    
    #go through .tar.gz files and uncompress them
    copy "../$item", "$item";          #copy .gz file to temp dir ...
    print"$item\n";
    system "gzip -dnv $item";         #ungzip ...
    my $ftar = substr($item,0,-3);  #remove extension .gz ...
    print"$ftar\n";
    my $tar_error = system "tar -xm -f $ftar";  #and untar    
 
     #untar may not work if archive has only one file. In that case 
     #gzip gave already a tex file -just need to rename it.
     if ($tar_error)                         
     {        
         rename "$ftar", substr($ftar,0,-4).".tex";
     }
     else
     {
         unlink $ftar;       #tar file needs to be deleted, otherwise it will be in @textlist
     }
  
    
    # Let's extract the data of the corresponding ABS file first..

    my $file_ID = substr($ftar,0,-4);

    my $input_file = "../". $file_ID . ".abs";
    open (MYINPUTFILE, "$input_file") or die "Couldn't open $input_file for reading: $!\n"; # open for input
    my $file_content = "FILE: \n";
    while ( <MYINPUTFILE> )
      {
	$file_content = $file_content . $_;
      }
	close(MYINPUTFILE);
	#$file_content =~ s/\n/\t/g;   #to make more clear where are the separators..
        # print "$file_content\n";sleep 5;
        $file_content =~ /\nDate:\s\S+,\s([^\n]+)\s\s\(/;
        my $file_date = $1;
    
        $file_content =~ /\nTitle:\s([^\n]+)/;
        my $file_tit = $1;

        $file_content =~ /\nAuthors:\s([^\n]+)/;
        my $file_auth = $1;

        $file_content =~ /\nCategories:\s([^\n]+)/;
        $file_cat = $1;

        
       

       
    
  
     #process TeX files. Note that some of the files had extension .tex whereas others had .latex or .txt
     #note that there is a problem in looking just for <*.*t*> -you also get .sty files, which are style files!
     my @texlist;
     my @texlist1 = <*.latex>;
     my @texlist2 = <*.tex>;
     my @texlist3 = <*.txt>;
    
     push(@texlist, @texlist1);
     push(@texlist, @texlist2);
     push(@texlist, @texlist3);
    
     my $valid_zip = 0;          #Is zip code valid?
     my @author;                 #Array with authors/addresses
     my $author_index = 0;   #Index into @author
     my $aux1 = 0;               #American-like post code detected?
     my $myflag = 0;
 
  print STDERR "test100......................\n";
   
     #Go through .TeX files in dir $tdir and assign @author
     foreach my $texfile (@texlist)
     {        
         #print LAT_LON_OUT_FILE "$ftar \n";

         my $l = new LaTeX::Parser 'file' => $texfile;
         
         printf("aaaa %s\n", $texfile);


 print STDERR "test100......................\n";
         my $p = $l->latex;
  print STDERR "test100......................\n";

         my $next_field = 0;              #if $next_field=1 then next field is author; $next_field=2 then next field is address
         my $ambiguous_field = 1;     #sometimes the author and address are ambiguous. Detect these.
      
  

         #Go through TeX fields in files and assign array @author
         for (my $i=0; $i<=$#{$p}; $i++)
         {
             
	   my $authorfield = $p->[$i]; #contains LaTeX fields
	   
            
             #Some author/address fields are empty others don't have a LateX field '{' -skip them
             next if (($next_field == 1 || $next_field == 2) && (!($authorfield =~ /\w/) || !($authorfield =~ /\{/)));
            
             #Assign author field]
             if ($next_field == 1 || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\small/i)) 
             {
                 $author[$author_index] = "author: ".$authorfield;
                 $next_field = 0;
                 $author_index++;
                 $ambiguous_field = 0;   #We have detected an author field, so file is unambiguous
             }
            
             #Assign address field
             if ($next_field == 2)
             {
                 $author[$author_index] = "address: ".$authorfield;
		 $next_field = 0;
                 $author_index++;
                 $ambiguous_field = 0;   #We have detected an address field, so file is unambiguous
             }
            
             #next field to be read will be the author
             $next_field = 1 if (($authorfield =~ /\\aut(?:\w)+/) || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\center/i) || ($authorfield =~ /\\small/i));
            
             #next field to be read will be the address
             $next_field = 2 if ($authorfield =~ /\\add(?:\w)+/) || ($authorfield =~/\\affi(?:\w)+/) || ($authorfield =~ /\\inst(?:\w)*/) || (($authorfield =~/\{\d\}/) and ($myflag == 1));

	     #two fields after will be an address
	     if ($authorfield =~/\\altaffiltext\s?/) { $myflag = 1;}
	     #if (($authorfield =~/\{\d\}/) and ($myflag == 1)){ $myflag = 2;}
	     if (($authorfield =~/\{([^=]+,+[^=]+)\}/) and ($myflag == 1)) {$myflag = 0;}
            
             #we are only interested in the header of the paper, so if you find abstract
             #and you have found author/address then end loop               
             last if ($authorfield  =~ /^\{abst(?:\w)+/ && !$ambiguous_field);            
         }
        
         #Skip files that do not have author/address
         next if ($ambiguous_field);    


      

         #Check zip codes
         my $author_only = 0;
         $aux1 = 0; my $aux2 = 0;
         $valid_zip = 0;
         foreach my $adr (@author)
         {        
           
	
	     #Look for zip code if field is address or if there is no address field in the @author array
             #(in which case the address will be in the author field)            
             if ($adr =~ /address/ || !(grep {/address/} @author))
             {
                     
                      
                      print STDERR "test5......................\n";
		      #$adr =~ /address: \{([^=]+,+[^=]+)\}/;
	              $adr =~ /address: \{([^=]+,+[^=]+)\}/;
                      print STDERR "test6......................\n";
		      if($adr =~ /address: \{([^=]+,+[^=]+)\}/)  {
				 
				$adr = $1;
				 print STDERR "test6......................\n";
				 $adr =~ s/\n/ /g;
				 #$adr =~ s/\\/ /g;  
				 $adr =~ s/{//g;
				 $adr =~ s/}//g;
				 $adr =~ s/\.$//g;
				 $adr =~ s/\s+/ /g;
				 $adr =~ s/\\+\s*$//g;
				     
				 
				 print STDERR "test4......................\n";
				 #Let's check multiple adresses in a row..
				 #if ( $adr =~ /\$\^\S+\$/){
				 # These commands transform numbered separators into \instA. Further versions of the code could extract author-address relationship
				   $adr =~ s/\$\^?\S+\s?\S*\$/\\instA/g;
				   $adr =~ s/\\+\s?\d\./\\instA/g;
				   $adr =~ s/\\+\s?\d\-/\\instA/g;
				   $adr =~ s/\\+\s?\S\)/\\instA/g;
				   $adr =~ s/\\+\s?\d\)/\\instA/g;
				   
				   print STDERR "test3......................\n";
				 #These ones transform different usages of \and into \instA
				   $adr =~ s/\\+\s?and\s?/\\instA/g;
				   $adr =~ s/,?\sand\s?\\+/\\instA/g;
				   $adr =~ s/\\newline/\\instA/g;
				   $adr =~ s/;\s\\+/\\instA/g;
				   $adr =~ s/,\\+/\\instA/g;
				   $adr =~ s/\\medskip/\\instA/g;
				   
				
				 if ( $adr=~/(\\inst)/ ){  # Separates multiple addresses linked by a \inst
				   my $pos = rindex($adr, "\\inst");
				   while ($pos > 0){
				     my $resul = substr($adr,$pos+6);
				     $resul =~ s/,?\s*\\\*s*\\\*s*\\*$//g;
				     $resul =~ s/^\\,\s//g;
				     $resul =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
				     $resul =~ s/[^\s]+\@[^\=]+//g;
				     $resul =~ s/\\S+$//g;
				     $resul =~ s/\\\s\\[^=]+$//g;
				     $resul =~ s/\\+\s?\[\\affilskip\]\s*$//g;
				     $resul =~ s/\,\s?$//g;
				     $resul =~ s/\.$//g;
				     $resul =~ s/\;$//g;
				     $resul =~ s/\;\s+\S+\.\S+\.?\S*$//g;
				     #$resul =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
				     $resul =~ s/\\+\s*$//g;
				     if (length($resul) > 10) {print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$resul\n";}
				     $adr = substr($adr,0,$pos); 
				     $pos = rindex($adr, "\\inst", $pos+6);
				      if (($pos == -1) and (length($adr) > 10)) {  #there is no \inst at the beginning. Print the remaining of the string and leave. 
 				     $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
				     $adr =~ s/[^\s]+\@[^\=]+//g;
				     $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
				     $adr =~ s/\\S+$//g;
				     $adr =~ s/\\\s\\[^=]+$//g;
				     $adr =~ s/\,\s?$//g;
				     $adr =~ s/\.$//g;
				     $adr =~ s/\;$//g;
				     $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
				     #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
				     $adr =~ s/\\+\s*$//g;
				     print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
 				     }
				   }
				 }


				 else {
				   $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
				   $adr =~ s/[^\s]+\@[^\=]+//g;
				     $adr =~ s/\\S+$//g;
				     $adr =~ s/\\\s\\[^\=]+$//g;
				     $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
				     $adr =~ s/\,\s?$//g;
				     $adr =~ s/\.$//g;
				     $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
				     #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
				     $adr =~ s/\\+\s*$//g;
				     print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
				  
				 }
		       	}
					      
	      }
         }
    
     }
                    
    
 }

close LAT_LON_OUT_FILE;

But I am getting an error...like this:
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.

Can sumbody sugest what could be the problem and the solution to it,
I am using XAMPP server, perl 5.10.1 and LaTex:: Parser module to do this. Thanks

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.