| | |
LaTeX::Parser module problem
![]() |
•
•
Join Date: Apr 2009
Posts: 19
Reputation:
Solved Threads: 0
Hi everyone,
I am using this code to parse data from latex files :
But I am getting an error like this:
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.
Can somebody suggest what the problem could be and how to solve it?
I am using the XAMPP server, Perl 5.10.1 and the LaTeX::Parser module to do this. Thanks.
I am using this code to parse data from latex files :
Perl Syntax (Toggle Plain Text)
#!/Applications/xampp/xamppfiles/bin/perl
#
# For every *.tar.gz archive in the current directory:
#   1. unpack it into the scratch directory ./temp,
#   2. read the matching ../<id>.abs metadata file (Date/Title/Authors/Categories),
#   3. parse every .latex/.tex/.txt file of the archive with LaTeX::Parser,
#   4. collect the fields that look like author / address fields, and
#   5. write one "<id>\t<categories>\t<address>" line per address to ID+add.txt.
#
# A TeX file that LaTeX::Parser cannot handle (it dies with messages such as
# "A \ Command I don't understand") is reported on STDERR and skipped instead
# of aborting the whole run.
use strict;
use warnings;
use File::Copy;
use LaTeX::Parser;    # package name is case-sensitive: LaTeX, not LaTex
use Text::CSV_XS;
use utf8;
use Unicode::String;

Unicode::String->stringify_as('utf8');    # utf8 already is the default

# see Ch 7.1 of Data Munging with Perl
my $output = "ID+add.txt";
open my $out_fh, '>', $output
    or die "Couldn't open $output for writing: $!\n";
binmode $out_fh, ":utf8";

# This routine is supposed to work in the dir where the .tar.gz files are.
# It extracts each .tar.gz file into a temp directory.
my $tdir = 'temp';
mkdir $tdir;
my @flist = glob '*.tar.gz';

# The loop below deletes every "*.*" file in the current directory, so it
# must never run unless we really are inside the scratch directory.
chdir $tdir or die "Couldn't chdir to $tdir: $!\n";

my $not_processed = 0;      # Number of TeX files that the script cannot process
my $file_cat      = " ";    # Categories of the current paper

# Go through all the .tar.gz files found in the start directory
foreach my $item (@flist) {
    unlink glob "*.*";      # erase all files in dir $tdir

    # copy the .gz file to the temp dir and uncompress it
    if (!copy("../$item", $item)) {
        warn "Couldn't copy ../$item into $tdir: $!\n";
        next;
    }
    print "$item\n";
    system('gzip', '-dnv', $item);    # list form: no shell, safe for odd file names

    my $ftar = substr($item, 0, -3);  # remove extension .gz
    print "$ftar\n";
    my $tar_error = system('tar', '-xm', '-f', $ftar);

    # untar may not work if the archive has only one file. In that case
    # gzip already gave a tex file -just need to rename it.
    if ($tar_error) {
        rename $ftar, substr($ftar, 0, -4) . ".tex";
    }
    else {
        unlink $ftar;    # tar file needs to be deleted, otherwise it will be in @texlist
    }

    # Let's extract the data of the corresponding ABS file first..
    my $file_ID    = substr($ftar, 0, -4);
    my $input_file = "../" . $file_ID . ".abs";
    open my $abs_fh, '<', $input_file
        or die "Couldn't open $input_file for reading: $!\n";
    my $file_content = "FILE: \n" . join('', <$abs_fh>);
    close $abs_fh;

    # Capture in list context so a failed match yields undef instead of a
    # stale $1 left over from an earlier match.
    # NOTE(review): date, title and authors are extracted but not used below.
    my ($file_date) = $file_content =~ /\nDate:\s\S+,\s([^\n]+)\s\s\(/;
    my ($file_tit)  = $file_content =~ /\nTitle:\s([^\n]+)/;
    my ($file_auth) = $file_content =~ /\nAuthors:\s([^\n]+)/;
    my ($categories) = $file_content =~ /\nCategories:\s([^\n]+)/;
    $file_cat = defined $categories ? $categories : " ";

    # Process TeX files. Note that some of the files had extension .tex whereas
    # others had .latex or .txt. Looking just for <*.*t*> would also pick up
    # .sty files, which are style files!
    my @texlist = (glob('*.latex'), glob('*.tex'), glob('*.txt'));

    my @author;              # Array with authors/addresses
    my $author_index = 0;    # Index into @author
    my $myflag       = 0;    # 1 after \altaffiltext has been seen

    print STDERR "test100......................\n";

    # Go through the TeX files in dir $tdir and assign @author
    foreach my $texfile (@texlist) {

        # LaTeX::Parser dies on input it cannot handle; trap that so one bad
        # file does not kill the whole batch.
        my $p = eval {
            my $parser = LaTeX::Parser->new('file' => $texfile);
            printf("aaaa %s\n", $texfile);
            print STDERR "test100......................\n";
            $parser->latex;
        };
        if (!$p) {
            my $reason = $@ ? $@ : "parser returned nothing\n";
            warn "Skipping $texfile: $reason";
            $not_processed++;
            next;
        }
        print STDERR "test100......................\n";

        my $next_field      = 0;    # 1: next field is author; 2: next field is address
        my $ambiguous_field = 1;    # stays 1 until an author/address field is detected

        # Go through the TeX fields of the file and fill @author
        foreach my $authorfield (@{$p}) {

            # Some author/address fields are empty, others don't have a
            # LaTeX field '{' -skip them
            next
                if ($next_field == 1 || $next_field == 2)
                && ($authorfield !~ /\w/ || $authorfield !~ /\{/);

            # Assign author field
            if (   $next_field == 1
                || $authorfield =~ /\\large/i
                || $authorfield =~ /\\small/i)
            {
                $author[$author_index] = "author: " . $authorfield;
                $next_field = 0;
                $author_index++;
                $ambiguous_field = 0;    # author field detected, file is unambiguous
            }

            # Assign address field
            if ($next_field == 2) {
                $author[$author_index] = "address: " . $authorfield;
                $next_field = 0;
                $author_index++;
                $ambiguous_field = 0;    # address field detected, file is unambiguous
            }

            # next field to be read will be the author
            if (   $authorfield =~ /\\aut(?:\w)+/
                || $authorfield =~ /\\large/i
                || $authorfield =~ /\\center/i
                || $authorfield =~ /\\small/i)
            {
                $next_field = 1;
            }

            # next field to be read will be the address
            if (   $authorfield =~ /\\add(?:\w)+/
                || $authorfield =~ /\\affi(?:\w)+/
                || $authorfield =~ /\\inst(?:\w)*/
                || ($authorfield =~ /\{\d\}/ && $myflag == 1))
            {
                $next_field = 2;
            }

            # two fields after \altaffiltext will be an address
            if ($authorfield =~ /\\altaffiltext\s?/) {
                $myflag = 1;
            }
            if ($authorfield =~ /\{([^=]+,+[^=]+)\}/ && $myflag == 1) {
                $myflag = 0;
            }

            # we are only interested in the header of the paper, so stop at
            # the abstract once an author/address has been found
            last if $authorfield =~ /^\{abst(?:\w)+/ && !$ambiguous_field;
        }

        # Skip files that do not have author/address
        next if $ambiguous_field;

        # $adr aliases the elements of @author, so the clean-up below is also
        # visible to the grep on later iterations (same as the original code).
        foreach my $adr (@author) {

            # Look at the field if it is an address, or if there is no address
            # field in @author at all (in which case the address would be in
            # the author field)
            next unless $adr =~ /address/ || !(grep {/address/} @author);
            print STDERR "test5......................\n";
            print STDERR "test6......................\n";

            next unless $adr =~ /address: \{([^=]+,+[^=]+)\}/;
            $adr = $1;
            print STDERR "test6......................\n";

            $adr =~ s/\n/ /g;
            $adr =~ s/[{}]//g;
            $adr =~ s/\.$//g;
            $adr =~ s/\s+/ /g;
            $adr =~ s/\\+\s*$//g;
            print STDERR "test4......................\n";

            # Let's check multiple addresses in a row..
            # These commands transform numbered separators into \instA.
            # Further versions of the code could extract the author-address
            # relationship.
            $adr =~ s/\$\^?\S+\s?\S*\$/\\instA/g;
            $adr =~ s/\\+\s?\d\./\\instA/g;
            $adr =~ s/\\+\s?\d\-/\\instA/g;
            $adr =~ s/\\+\s?\S\)/\\instA/g;
            $adr =~ s/\\+\s?\d\)/\\instA/g;
            print STDERR "test3......................\n";

            # These ones transform different usages of \and into \instA
            $adr =~ s/\\+\s?and\s?/\\instA/g;
            $adr =~ s/,?\sand\s?\\+/\\instA/g;
            $adr =~ s/\\newline/\\instA/g;
            $adr =~ s/;\s\\+/\\instA/g;
            $adr =~ s/,\\+/\\instA/g;
            $adr =~ s/\\medskip/\\instA/g;

            if ($adr =~ /\\inst/) {

                # Separate multiple addresses linked by a \inst, working from
                # the last separator back to the first. The test is ">= 0"
                # (not "> 0") so the address that follows a separator at the
                # very start of the string is not dropped.
                my $pos = rindex($adr, "\\inst");
                while ($pos >= 0) {

                    # Skip the 6 characters of the "\instA" marker.
                    my $resul =
                        $pos + 6 <= length($adr) ? substr($adr, $pos + 6) : '';
                    $resul =~ s/,?\s*\\\*s*\\\*s*\\*$//g;
                    $resul =~ s/^\\,\s//g;
                    $resul =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                    $resul =~ s/[^\s]+\@[^\=]+//g;
                    $resul =~ s/\\S+$//g;
                    $resul =~ s/\\\s\\[^=]+$//g;
                    $resul =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                    $resul =~ s/\,\s?$//g;
                    $resul =~ s/\.$//g;
                    $resul =~ s/\;$//g;
                    $resul =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                    $resul =~ s/\\+\s*$//g;
                    if (length($resul) > 10) {
                        print {$out_fh} "$file_ID\t$file_cat\t$resul\n";
                    }

                    $adr = substr($adr, 0, $pos);
                    $pos = rindex($adr, "\\inst", $pos + 6);

                    if ($pos == -1 && length($adr) > 10) {

                        # There is no \inst at the beginning. Print the
                        # remainder of the string and leave.
                        $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                        $adr =~ s/[^\s]+\@[^\=]+//g;
                        $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                        $adr =~ s/\\S+$//g;
                        $adr =~ s/\\\s\\[^=]+$//g;
                        $adr =~ s/\,\s?$//g;
                        $adr =~ s/\.$//g;
                        $adr =~ s/\;$//g;
                        $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                        $adr =~ s/\\+\s*$//g;
                        print {$out_fh} "$file_ID\t$file_cat\t$adr\n";
                    }
                }
            }
            else {
                # Single address: clean it up and print it as is.
                $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
                $adr =~ s/[^\s]+\@[^\=]+//g;
                $adr =~ s/\\S+$//g;
                $adr =~ s/\\\s\\[^\=]+$//g;
                $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
                $adr =~ s/\,\s?$//g;
                $adr =~ s/\.$//g;
                $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
                $adr =~ s/\\+\s*$//g;
                print {$out_fh} "$file_ID\t$file_cat\t$adr\n";
            }
        }
    }
}

# Buffered write errors only surface at close, so check it.
close $out_fh or die "Couldn't close $output: $!\n";

print STDERR "$not_processed TeX file(s) could not be parsed and were skipped\n"
    if $not_processed;
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.
Can somebody suggest what the problem could be and how to solve it?
I am using the XAMPP server, Perl 5.10.1 and the LaTeX::Parser module to do this. Thanks.
Last edited by ajay_p5; 21 Days Ago at 1:20 pm.
![]() |
Similar Threads
- CPAN Perl module Installation Problem on Mac (Perl)
- Parser homework problem -- need help! (C++)
- Module Installation Problem (Perl)
- LWP problem (Perl)
- Basic Class Module Problem (Visual Basic 4 / 5 / 6)
- Module error? and gasp module problem (Python)
- Problem in Unicode Parsing. (Java)
- "use" statement inside an eval: problem? (Perl)
Other Threads in the Perl Forum
- Previous Thread: module caching in mod_perl
- Next Thread: php coding
| Thread Tools | Search this Thread |





