944,110 Members | Top Members by Rank

Ad:
  • Perl Discussion Thread
  • Unsolved
  • Views: 1033
  • Perl RSS
Nov 6th, 2009
0

LaTeX::Parser module problem

Expand Post »
Hi everyone,

I am using this code to parse data from LaTeX files:
Perl Syntax (Toggle Plain Text)
  1. #!/Applications/xampp/xamppfiles/bin/perl
  2. use strict;
  3. use warnings;
  4. use utf8;
  5. use File::Copy;
  6. use LaTeX::Parser; # package name is LaTeX::Parser (capital X); the "LaTex" spelling only loads on case-insensitive file systems
  7. use Text::CSV_XS;
  8. use Unicode::String;
  9. Unicode::String->stringify_as( 'utf8' ); # utf8 already is the default
  10. 
  11. # Writes one tab-separated line (paper ID, categories, address) per address found in the TeX sources.
  12. #see Ch 7.1 of Data Munging with Perl
  13. 
  14. my $output = "ID+add.txt";
  15. 
  16. open (LAT_LON_OUT_FILE, '>', $output) or die "Couldn't open $output for writing: $!\n"; # 3-arg open: the name is never parsed for a mode
  17. binmode LAT_LON_OUT_FILE, ":utf8";
  18. 
  19. 
  20. 
  21. 
  22. #This routine is supposed to work in the dir where the .tar.gz files are
  23. #It extracts each .tar.gz file into a temp directory
  24. my $tdir = 'temp';
  25. -d $tdir or mkdir $tdir or die "Couldn't create directory $tdir: $!\n";
  26. 
  27. my @flist=<*.tar.gz>;
  28. chdir $tdir or die "Couldn't enter directory $tdir: $!\n"; # the main loop deletes *.* in the current directory, so never continue outside $tdir
  29. my $not_processed = 0; #Number of files that the script cannot process
  30. my $number_of_papers = 0;
  31. my $file_cat = " ";
  32.  
  33. #Go through all the .tar.gz files in current dir
  34. foreach my $item (@flist)
  35. {
  36. unlink glob "*.*"; #erase all files in dir $tdir
  37. 
  38. #go through .tar.gz files and uncompress them
  39. unless (copy("../$item", $item)) { warn "Couldn't copy $item: $!\n"; $not_processed++; next; } #copy .gz file to temp dir, skip the archive if that fails
  40. print"$item\n";
  41. system('gzip', '-dnv', $item); #ungzip ... (list form: the file name never passes through a shell)
  42. my $ftar = substr($item,0,-3); #remove extension .gz ...
  43. print"$ftar\n";
  44. my $tar_error = system('tar', '-xm', '-f', $ftar); #and untar; a non-zero status means tar could not unpack it
  45. 
  46. #untar may not work if archive has only one file. In that case
  47. #gzip gave already a tex file -just need to rename it.
  48. if ($tar_error)
  49. {
  50. rename($ftar, substr($ftar,0,-4).".tex") or warn "Couldn't rename $ftar: $!\n";
  51. }
  52. else
  53. {
  54. unlink $ftar; #tar file needs to be deleted, otherwise it will be in @textlist
  55. }
  56.  
  57.  
  58. # Let's extract the data of the corresponding ABS file first..
  59. # The .abs file is read from the parent directory and shares the archive's base name (the ID).
  60. my $file_ID = substr($ftar,0,-4);
  61. 
  62. my $input_file = "../". $file_ID . ".abs";
  63. open (my $abs_fh, '<', $input_file) or die "Couldn't open $input_file for reading: $!\n"; # open for input
  64. my $file_content = "FILE: \n"; # leading line lets the "\nField:" patterns match a field on the file's first line
  65. {
  66. local $/; # slurp mode, limited to this block
  67. my $abs_text = <$abs_fh>;
  68. $file_content .= $abs_text if defined $abs_text;
  69. }
  70. close($abs_fh);
  71. #$file_content =~ s/\n/\t/g; #to make more clear where are the separators..
  72. # print "$file_content\n";sleep 5;
  73. # A match in list context returns its captures, or an empty list on failure, so a
  74. # missing field becomes undef instead of silently inheriting $1 from an earlier match.
  75. my ($file_date) = $file_content =~ /\nDate:\s\S+,\s([^\n]+)\s\s\(/;
  76. my ($file_tit) = $file_content =~ /\nTitle:\s([^\n]+)/;
  77. my ($file_auth) = $file_content =~ /\nAuthors:\s([^\n]+)/;
  78. 
  79. ($file_cat) = $file_content =~ /\nCategories:\s([^\n]+)/;
  80. $file_cat = " " unless defined $file_cat; # fall back to the single-space placeholder when the field is absent
  81. 
  82. 
  83.  
  84.  
  85.  
  86.  
  87.  
  88.  
  89.  
  90. # Collect the TeX sources of the unpacked paper: some use .tex, others .latex or .txt.
  91. # Each extension is globbed separately because a looser pattern such as <*.*t*> also picks up .sty style files.
  92. my @texlist = (
  93. glob('*.latex'),
  94. glob('*.tex'),
  95. glob('*.txt'),
  96. );
  97. 
  98. # State kept for the current archive and shared by all of its TeX files.
  99. my @author; # collected "author: ..." / "address: ..." entries
  100. my $author_index = 0; # next free slot in @author
  101. my $valid_zip = 0; # is zip code valid?
  102. my $aux1 = 0; # American-like post code detected?
  103. my $myflag = 0;
  104. 
  105. 
  106. 
  107. print STDERR "test100......................\n";
  108.  
  109. #Go through .TeX files in dir $tdir and assign @author
  110. foreach my $texfile (@texlist)
  111. {
  112. #print LAT_LON_OUT_FILE "$ftar \n";
  113. # Parse one TeX file into $p, the array reference of LaTeX fields scanned below.
  114. my $l = LaTeX::Parser->new('file' => $texfile); # direct method call instead of the indirect "new Class" syntax
  115. 
  116. printf("aaaa %s\n", $texfile);
  117. # The parser can stop with "A \ Command I don't understand" on markup it does not
  118. # know; trap that so a single bad file is skipped instead of ending the whole run.
  119. print STDERR "test100......................\n";
  120. my $p = eval { $l->latex };
  121. if (!defined $p) { warn "Skipping $texfile: " . ($@ || "parser returned nothing\n"); $not_processed++; next; }
  122. print STDERR "test100......................\n";
  123. my $next_field = 0; #if $next_field=1 then next field is author; $next_field=2 then next field is address
  124. my $ambiguous_field = 1; #sometimes the author and address are ambiguous. Detect these.
  125. # $ambiguous_field stays 1 until at least one author or address entry has been stored.
  126. # The loop below is a small state machine: each field may be stored according to the
  127. # current $next_field, and then decides what the following field is expected to be.
  128. #Go through TeX fields in files and assign array @author
  129. for (my $i=0; $i<=$#{$p}; $i++)
  130. {
  131. # $p is used as an array reference; each element is matched as a string.
  132. my $authorfield = $p->[$i]; #contains LaTeX fields
  133. 
  134. # While an author/address is pending, the pending state is kept and the search moves on.
  135. #Some author/address fields are empty others don't have a LaTeX field '{' -skip them
  136. next if (($next_field == 1 || $next_field == 2) && (!($authorfield =~ /\w/) || !($authorfield =~ /\{/)));
  137. # A field containing \large or \small is stored as an author even when none was pending.
  138. #Assign author field
  139. if ($next_field == 1 || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\small/i))
  140. {
  141. $author[$author_index] = "author: ".$authorfield;
  142. $next_field = 0;
  143. $author_index++;
  144. $ambiguous_field = 0; #We have detected an author field, so file is unambiguous
  145. }
  146. # Not reached for a field just stored as author, because $next_field was reset to 0 above.
  147. #Assign address field
  148. if ($next_field == 2)
  149. {
  150. $author[$author_index] = "address: ".$authorfield;
  151. $next_field = 0;
  152. $author_index++;
  153. $ambiguous_field = 0; #We have detected an address field, so file is unambiguous
  154. }
  155. # Triggers: a command starting \aut..., or \large, \center, \small (the last three case-insensitive).
  156. #next field to be read will be the author
  157. $next_field = 1 if (($authorfield =~ /\\aut(?:\w)+/) || ($authorfield =~ /\\large/i) || ($authorfield =~ /\\center/i) || ($authorfield =~ /\\small/i));
  158. # Triggers: \add..., \affi..., \inst..., or a {digit} field while $myflag is 1. Runs after the author rule, so it wins if both match.
  159. #next field to be read will be the address
  160. $next_field = 2 if ($authorfield =~ /\\add(?:\w)+/) || ($authorfield =~/\\affi(?:\w)+/) || ($authorfield =~ /\\inst(?:\w)*/) || (($authorfield =~/\{\d\}/) and ($myflag == 1));
  161. # $myflag is raised by \altaffiltext and lowered again by a braced field that contains a comma and no '='.
  162. #two fields after will be an address
  163. if ($authorfield =~/\\altaffiltext\s?/) { $myflag = 1;}
  164. #if (($authorfield =~/\{\d\}/) and ($myflag == 1)){ $myflag = 2;}
  165. if (($authorfield =~/\{([^=]+,+[^=]+)\}/) and ($myflag == 1)) {$myflag = 0;}
  166. 
  167. #we are only interested in the header of the paper, so if you find abstract
  168. #and you have found author/address then end loop
  169. last if ($authorfield =~ /^\{abst(?:\w)+/ && !$ambiguous_field);
  170. }
  171. 
  172. #Skip files that do not have author/address
  173. next if ($ambiguous_field);
  174.  
  175.  
  176.  
  177.  
  178. #Check zip codes
  179. my $author_only = 0;
  180. $aux1 = 0; my $aux2 = 0;
  181. $valid_zip = 0;
  182. foreach my $adr (@author)
  183. {
  184. # NOTE(review): in the code shown, $author_only, $aux1, $aux2 and $valid_zip are only assigned, never read.
  185. # foreach aliases $adr to each element of @author, so the edits below rewrite @author in place.
  186. #Look for zip code if field is address or if there is no address field in the @author array
  187. #(in which case the address will be in the author field)
  188. if ($adr =~ /address/ || !(grep {/address/} @author))
  189. {
  190. # NOTE(review): the match on line 196 requires the literal text "address: {", so an
  191. # entry admitted only by the "no address field" clause produces no output here.
  192. print STDERR "test5......................\n";
  193. #$adr =~ /address: \{([^=]+,+[^=]+)\}/;
  194. $adr =~ /address: \{([^=]+,+[^=]+)\}/; # result discarded; the same match is repeated in the if below
  195. print STDERR "test6......................\n";
  196. if($adr =~ /address: \{([^=]+,+[^=]+)\}/) {
  197. # $1 is the braced text: it must contain at least one comma and no '=' character.
  198. $adr = $1;
  199. print STDERR "test6......................\n";
  200. $adr =~ s/\n/ /g;
  201. #$adr =~ s/\\/ /g;
  202. $adr =~ s/{//g;
  203. $adr =~ s/}//g;
  204. $adr =~ s/\.$//g;
  205. $adr =~ s/\s+/ /g;
  206. $adr =~ s/\\+\s*$//g;
  207. # At this point: newlines and braces removed, whitespace collapsed, trailing backslashes dropped.
  208. 
  209. print STDERR "test4......................\n";
  210. #Let's check multiple adresses in a row..
  211. #if ( $adr =~ /\$\^\S+\$/){
  212. # These commands transform numbered separators into \instA. Further versions of the code could extract author-address relationship
  213. $adr =~ s/\$\^?\S+\s?\S*\$/\\instA/g;
  214. $adr =~ s/\\+\s?\d\./\\instA/g;
  215. $adr =~ s/\\+\s?\d\-/\\instA/g;
  216. $adr =~ s/\\+\s?\S\)/\\instA/g;
  217. $adr =~ s/\\+\s?\d\)/\\instA/g;
  218. 
  219. print STDERR "test3......................\n";
  220. #These ones transform different usages of \and into \instA
  221. $adr =~ s/\\+\s?and\s?/\\instA/g;
  222. $adr =~ s/,?\sand\s?\\+/\\instA/g;
  223. $adr =~ s/\\newline/\\instA/g;
  224. $adr =~ s/;\s\\+/\\instA/g;
  225. $adr =~ s/,\\+/\\instA/g;
  226. $adr =~ s/\\medskip/\\instA/g;
  227. # Split from the right: take the text after the last marker, print it, cut it off, repeat.
  228. # The +6 offsets skip the six characters of an inserted "\instA" marker.
  229. if ( $adr=~/(\\inst)/ ){ # Separates multiple addresses linked by a \inst
  230. my $pos = rindex($adr, "\\inst");
  231. while ($pos > 0){ # a marker at index 0 (or none at all) ends the loop
  232. my $resul = substr($adr,$pos+6);
  233. $resul =~ s/,?\s*\\\*s*\\\*s*\\*$//g; # NOTE(review): "\\\*s*" matches a backslash, a literal '*' and optional 's' characters -- confirm whether "\\*\s*" was intended
  234. $resul =~ s/^\\,\s//g;
  235. $resul =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  236. $resul =~ s/[^\s]+\@[^\=]+//g;
  237. $resul =~ s/\\S+$//g; # NOTE(review): matches a backslash followed by capital S characters, not the \S class -- confirm intent
  238. $resul =~ s/\\\s\\[^=]+$//g;
  239. $resul =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  240. $resul =~ s/\,\s?$//g;
  241. $resul =~ s/\.$//g;
  242. $resul =~ s/\;$//g;
  243. $resul =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  244. #$resul =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  245. $resul =~ s/\\+\s*$//g;
  246. if (length($resul) > 10) {print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$resul\n";} # fragments of 10 characters or fewer are dropped
  247. $adr = substr($adr,0,$pos);
  248. $pos = rindex($adr, "\\inst", $pos+6);
  249. if (($pos == -1) and (length($adr) > 10)) { #there is no \inst at the beginning. Print the remaining of the string and leave.
  250. $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  251. $adr =~ s/[^\s]+\@[^\=]+//g;
  252. $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  253. $adr =~ s/\\S+$//g;
  254. $adr =~ s/\\\s\\[^=]+$//g;
  255. $adr =~ s/\,\s?$//g;
  256. $adr =~ s/\.$//g;
  257. $adr =~ s/\;$//g;
  258. $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  259. #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  260. $adr =~ s/\\+\s*$//g;
  261. print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
  262. }
  263. }
  264. }
  265. 
  266. # No \inst marker: the whole cleaned string is one address and is printed without a length check.
  267. else {
  268. $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  269. $adr =~ s/[^\s]+\@[^\=]+//g;
  270. $adr =~ s/\\S+$//g;
  271. $adr =~ s/\\\s\\[^\=]+$//g;
  272. $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  273. $adr =~ s/\,\s?$//g;
  274. $adr =~ s/\.$//g;
  275. $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  276. #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  277. $adr =~ s/\\+\s*$//g;
  278. print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
  279. 
  280. }
  281. }
  282. 
  283. }
  284. }
  285.  
  286. }
  287.  
  288.  
  289. }
  290.  
  291. close LAT_LON_OUT_FILE or die "Couldn't finish writing $output: $!\n"; # buffered write errors only surface at close
But I am getting an error like this:
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.

Can somebody suggest what the problem could be and how to solve it?
I am using the XAMPP server, Perl 5.10.1 and the LaTeX::Parser module to do this. Thanks
Last edited by ajay_p5; Nov 6th, 2009 at 1:20 pm.
Similar Threads
Reputation Points: 10
Solved Threads: 0
Light Poster
ajay_p5 is offline Offline
29 posts
since Apr 2009

This thread is more than three months old

No one has posted to this discussion for at least three months. Please let old threads die and do not reply to them unless you feel you have something new and valuable to contribute that absolutely must be added to make the discussion complete. Otherwise, please start a new thread in this forum instead.
Message:
Previous Thread in Perl Forum Timeline: module caching in mod_perl
Next Thread in Perl Forum Timeline: php coding





About Us | Contact Us | Advertise | Acceptable Use Policy
Forum Index | Build Custom RSS Feed


Follow us on Twitter


© 2011 DaniWeb® LLC