LaTeX::Parser module problem

Reply

Join Date: Apr 2009
Posts: 19
Reputation: ajay_p5 is an unknown quantity at this point 
Solved Threads: 0
ajay_p5 ajay_p5 is offline Offline
Newbie Poster

LaTeX::Parser module problem

 
0
  #1
21 Days Ago
Hi everyone,

I am using this code to parse data from LaTeX files:
  1. #!/Applications/xampp/xamppfiles/bin/perl
  2. use strict;
  3. use warnings;
  4. use File::Copy;
  5. use LaTeX::Parser; #the package is LaTeX::Parser; the misspelt "LaTex" can only load on a case-insensitive filesystem
  6. use Text::CSV_XS;
  7. use utf8;
  8. use Unicode::String;
  9. Unicode::String->stringify_as( 'utf8' ); # utf8 already is the default
  10.  
  11. #Output file of this script, opened once for the whole run and written as UTF-8
  12. #see Ch 7.1 of Data Munging with Perl
  13.  
  14. my $output = "ID+add.txt";
  15. #three-argument open: the mode can never be taken from the file name
  16. open (LAT_LON_OUT_FILE, '>', $output) or die "Couldn't open $output for writing: $!\n";
  17. binmode LAT_LON_OUT_FILE, ":utf8";
  18.  
  19.  
  20.  
  21.  
  22. #This routine is supposed to work in the dir where the .tar.gz files are
  23. #It extracts each .tar.gz file into a temp directory
  24. my $tdir = 'temp';
  25. -d $tdir or mkdir($tdir) or die "Couldn't create directory $tdir: $!\n"; #an already existing dir is fine
  26.  
  27. my @flist=<*.tar.gz>; #collected before the chdir, so the names are relative to the start dir
  28. chdir($tdir) or die "Couldn't chdir to $tdir: $!\n"; #must succeed: the loop below unlinks every file in the current dir
  29. my $not_processed = 0; #Number of files that the script cannot process
  30. my $number_of_papers = 0;
  31. my $file_cat = " ";
  32.  
  33. #Go through all the .tar.gz files in current dir
  34. foreach my $item (@flist)
  35. {
  36. unlink glob "*.*"; #erase all files in dir $tdir
  37.  
  38. #go through .tar.gz files and uncompress them
  39. copy("../$item", $item) or do { warn "Couldn't copy $item: $!\n"; $not_processed++; next; }; #copy .gz file to temp dir, skip the archive if that fails
  40. print"$item\n";
  41. system 'gzip', '-dnv', $item; #ungzip; list form so the shell never parses the file name
  42. my $ftar = substr($item,0,-3); #remove extension .gz ...
  43. print"$ftar\n";
  44. my $tar_error = system 'tar', '-xm', '-f', $ftar; #and untar (list form, no shell); non-zero means tar failed
  45.  
  46. #untar may not work if archive has only one file. In that case
  47. #gzip gave already a tex file -just need to rename it.
  48. if ($tar_error)
  49. {
  50. rename "$ftar", substr($ftar,0,-4).".tex";
  51. }
  52. else
  53. {
  54. unlink $ftar; #tar file needs to be deleted, otherwise it will be in @textlist
  55. }
  56.  
  57.  
  58. # Let's extract the data of the corresponding ABS file first..
  59.  
  60. my $file_ID = substr($ftar,0,-4); #$ftar without its last four characters (the ".tar" extension)
  61.  
  62. my $input_file = "../". $file_ID . ".abs";
  63. open (my $abs_fh, '<', $input_file) or die "Couldn't open $input_file for reading: $!\n"; # open for input
  64. my $file_content = "FILE: \n"; #dummy first line so the "\n"-anchored patterns below can match the file's first header
  65. while ( my $abs_line = <$abs_fh> )
  66. {
  67. $file_content .= $abs_line;
  68. }
  69. close($abs_fh);
  70. #$file_content =~ s/\n/\t/g; #to make more clear where are the separators..
  71. # print "$file_content\n";sleep 5;
  72. #Capture each header in list context: a failed match then yields undef instead of
  73. #silently reusing the $1 left over from the previous successful match.
  74. my ($file_date) = $file_content =~ /\nDate:\s\S+,\s([^\n]+)\s\s\(/;
  75.  
  76. my ($file_tit) = $file_content =~ /\nTitle:\s([^\n]+)/;
  77.  
  78. my ($file_auth) = $file_content =~ /\nAuthors:\s([^\n]+)/;
  79.  
  80. #fall back to the " " placeholder when there is no Categories header
  81. $file_cat = ($file_content =~ /\nCategories:\s([^\n]+)/) ? $1 : " ";
  82.  
  83.  
  84.  
  85.  
  86.  
  87.  
  88.  
  89.  
  90. #Gather the TeX sources of this archive. The extension varies: .latex, .tex or .txt
  91. #A catch-all pattern such as <*.*t*> is avoided because it would also pick up .sty style files
  92. my @texlist = (
  93. glob('*.latex'),
  94. glob('*.tex'),
  95. glob('*.txt'),
  96. );
  97.  
  98. #State declared once per archive, before the loop over its TeX files
  99. my ($valid_zip, $author_index, $aux1, $myflag) = (0, 0, 0, 0);
  100. # $valid_zip    - Is zip code valid?
  101. # $author_index - Index into @author
  102. # $aux1         - American-like post code detected?
  103. my @author; #Array with authors/addresses
  104.  
  105.  
  106.  
  107. print STDERR "test100......................\n";
  108.  
  109. #Go through .TeX files in dir $tdir and assign @author
  110. foreach my $texfile (@texlist)
  111. {
  112. #print LAT_LON_OUT_FILE "$ftar \n";
  113.  
  114. my $l = eval { LaTeX::Parser->new('file' => $texfile) }; #direct method call instead of the indirect-object "new Class" form
  115.  
  116. printf("aaaa %s\n", $texfile);
  117. #LaTeX::Parser dies on markup it cannot handle ("A \ Command I don't understand").
  118. #Trap the exception so that one unparseable file is skipped instead of ending the whole run.
  119. print STDERR "test100......................\n";
  120. my $p = defined $l ? eval { $l->latex } : undef; #used below as a reference to an array of LaTeX fields
  121. if (!defined $p) { warn "Skipping $texfile: LaTeX::Parser failed: $@"; next; }
  122. print STDERR "test100......................\n";
  123. my $next_field = 0; #if $next_field=1 then next field is author; $next_field=2 then next field is address
  124. my $ambiguous_field = 1; #sometimes the author and address are ambiguous. Detect these.
  125.  
  126.  
  127.  
  128. #Walk the parsed LaTeX fields of this file, filing author and address fields into @author
  129. foreach my $field (@{$p})
  130. {
  131. #While an author/address is expected, skip fields lacking a word character or a '{'
  132. if ($next_field == 1 || $next_field == 2)
  133. {
  134. next unless ($field =~ /\w/ && $field =~ /\{/);
  135. }
  136.  
  137. #Author: either announced by an earlier field, or the field itself uses \large or \small
  138. if ($next_field == 1 || ($field =~ /\\large/i) || ($field =~ /\\small/i))
  139. {
  140. $author[$author_index++] = "author: ".$field;
  141. $next_field = 0;
  142. $ambiguous_field = 0; #an author field was found, so the file is unambiguous
  143. }
  144. #Address announced by an earlier field (the author branch resets $next_field, so both never fire together)
  145. elsif ($next_field == 2)
  146. {
  147. $author[$author_index++] = "address: ".$field;
  148. $next_field = 0;
  149. $ambiguous_field = 0; #an address field was found, so the file is unambiguous
  150. }
  151.  
  152. #Does this field announce that the next field to be read is the author?
  153. if (($field =~ /\\aut(?:\w)+/) || ($field =~ /\\large/i) || ($field =~ /\\center/i) || ($field =~ /\\small/i))
  154. {
  155. $next_field = 1;
  156. }
  157.  
  158. #Does this field announce that the next one is the address? (tested last, so it overrides the author marker)
  159. if (($field =~ /\\add(?:\w)+/) || ($field =~/\\affi(?:\w)+/) || ($field =~ /\\inst(?:\w)*/) || (($field =~/\{\d\}/) && ($myflag == 1)))
  160. {
  161. $next_field = 2;
  162. }
  163.  
  164. #$myflag is raised by \altaffiltext and lowered again by a braced field that contains a comma
  165. if ($field =~/\\altaffiltext\s?/) { $myflag = 1; }
  166. if (($field =~/\{([^=]+,+[^=]+)\}/) && ($myflag == 1)) { $myflag = 0; }
  167.  
  168. #Only the header of the paper matters: stop at the abstract once an author/address was seen
  169. if (!$ambiguous_field && $field =~ /^\{abst(?:\w)+/) { last; }
  170. }
  171.  
  172. #Skip files that do not have author/address
  173. if ($ambiguous_field) { next; }
  174.  
  175.  
  176.  
  177.  
  178. #Check zip codes
  179. my $author_only = 0;
  180. $aux1 = 0; my $aux2 = 0;
  181. $valid_zip = 0;
  182. foreach my $adr (@author)
  183. {
  184. #$adr aliases the current element of @author, so the clean-up below rewrites @author in place.
  185. #NOTE(review): $author_only and $aux2 are never read in this block; $valid_zip and $aux1 are only reset here.
  186. #Look for zip code if field is address or if there is no address field in the @author array
  187. #(in which case the address will be in the author field)
  188. if ($adr =~ /address/ || !(grep {/address/} @author))
  189. {
  190. #NOTE(review): entries admitted only by the grep branch still need the literal "address: {" prefix below,
  191. #so an @author without any address entry prints nothing. The bare match below discards its result.
  192. print STDERR "test5......................\n";
  193. #$adr =~ /address: \{([^=]+,+[^=]+)\}/;
  194. $adr =~ /address: \{([^=]+,+[^=]+)\}/;
  195. print STDERR "test6......................\n";
  196. if($adr =~ /address: \{([^=]+,+[^=]+)\}/) {
  197. #Keep only the text captured between the braces, then normalise it
  198. $adr = $1;
  199. print STDERR "test6......................\n";
  200. $adr =~ s/\n/ /g;
  201. #$adr =~ s/\\/ /g;
  202. $adr =~ s/{//g;
  203. $adr =~ s/}//g;
  204. $adr =~ s/\.$//g;
  205. $adr =~ s/\s+/ /g;
  206. $adr =~ s/\\+\s*$//g;
  207. #So far: newlines became spaces, braces were removed, a final '.' was dropped,
  208. #whitespace runs were collapsed and trailing backslashes were stripped.
  209. print STDERR "test4......................\n";
  210. #Let's check multiple addresses in a row..
  211. #if ( $adr =~ /\$\^\S+\$/){
  212. # These commands transform numbered separators into \instA. Further versions of the code could extract author-address relationship
  213. $adr =~ s/\$\^?\S+\s?\S*\$/\\instA/g;
  214. $adr =~ s/\\+\s?\d\./\\instA/g;
  215. $adr =~ s/\\+\s?\d\-/\\instA/g;
  216. $adr =~ s/\\+\s?\S\)/\\instA/g;
  217. $adr =~ s/\\+\s?\d\)/\\instA/g;
  218.  
  219. print STDERR "test3......................\n";
  220. #These ones transform different usages of \and into \instA
  221. $adr =~ s/\\+\s?and\s?/\\instA/g;
  222. $adr =~ s/,?\sand\s?\\+/\\instA/g;
  223. $adr =~ s/\\newline/\\instA/g;
  224. $adr =~ s/;\s\\+/\\instA/g;
  225. $adr =~ s/,\\+/\\instA/g;
  226. $adr =~ s/\\medskip/\\instA/g;
  227. #Addresses are peeled off from the right: the text starting 6 characters after the last "\inst" is cleaned
  228. #and printed when longer than 10 characters, then $adr is cut back to the text before that marker.
  229. if ( $adr=~/(\\inst)/ ){ # Separates multiple addresses linked by a \inst
  230. my $pos = rindex($adr, "\\inst");
  231. while ($pos > 0){ #NOTE(review): a marker at offset 0 ends the loop without the "remaining string" print below
  232. my $resul = substr($adr,$pos+6);
  233. $resul =~ s/,?\s*\\\*s*\\\*s*\\*$//g; #NOTE(review): as written this needs a literal "\*" twice; "\\*\s*" may have been intended - confirm
  234. $resul =~ s/^\\,\s//g;
  235. $resul =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  236. $resul =~ s/[^\s]+\@[^\=]+//g;
  237. $resul =~ s/\\S+$//g; #NOTE(review): "\\S+" is a literal backslash followed by 'S' characters, not the \S class - confirm intent
  238. $resul =~ s/\\\s\\[^=]+$//g;
  239. $resul =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  240. $resul =~ s/\,\s?$//g;
  241. $resul =~ s/\.$//g;
  242. $resul =~ s/\;$//g;
  243. $resul =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  244. #$resul =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  245. $resul =~ s/\\+\s*$//g;
  246. if (length($resul) > 10) {print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$resul\n";}
  247. $adr = substr($adr,0,$pos);
  248. $pos = rindex($adr, "\\inst", $pos+6);
  249. if (($pos == -1) and (length($adr) > 10)) { #there is no \inst at the beginning. Print the remaining of the string and leave.
  250. $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  251. $adr =~ s/[^\s]+\@[^\=]+//g;
  252. $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  253. $adr =~ s/\\S+$//g;
  254. $adr =~ s/\\\s\\[^=]+$//g;
  255. $adr =~ s/\,\s?$//g;
  256. $adr =~ s/\.$//g;
  257. $adr =~ s/\;$//g;
  258. $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  259. #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  260. $adr =~ s/\\+\s*$//g;
  261. print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
  262. }
  263. }
  264. }
  265.  
  266. #No \inst marker at all: clean the single address and print it (no length check in this branch)
  267. else {
  268. $adr =~ s/(email|E-mail|e-mail):?\s*[^\s]+//g;
  269. $adr =~ s/[^\s]+\@[^\=]+//g;
  270. $adr =~ s/\\S+$//g;
  271. $adr =~ s/\\\s\\[^\=]+$//g;
  272. $adr =~ s/\\+\s?\[\\affilskip\]\s*$//g;
  273. $adr =~ s/\,\s?$//g;
  274. $adr =~ s/\.$//g;
  275. $adr =~ s/\;\s+\S+\.\S+\.?\S*$//g;
  276. #$adr =~ s/\\+\s+\S+\.\S+\.?\S*$//g;
  277. $adr =~ s/\\+\s*$//g;
  278. print LAT_LON_OUT_FILE "$file_ID\t$file_cat\t$adr\n";
  279.  
  280. }
  281. }
  282.  
  283. }
  284. }
  285.  
  286. }
  287.  
  288.  
  289. }
  290.  
  291. close(LAT_LON_OUT_FILE) or die "Couldn't close $output: $!\n"; #buffered write errors only surface at close
But I am getting an error like this:
A \ Command I don't understand at /Applications/XAMPP/xamppfiles/lib/perl5/site_perl/5.10.0/LaTex/Parser.pm line 115.

Can somebody suggest what the problem could be and how to solve it?
I am using the XAMPP server, Perl 5.10.1 and the LaTeX::Parser module to do this. Thanks
Last edited by ajay_p5; 21 Days Ago at 1:20 pm.
Reply With Quote Quick reply to this message  
Reply

Message:


Thread Tools Search this Thread



About Us | Contact Us | Advertise | DaniWeb | Acceptable Use Policy | RSS Feed

©2003 - 2009 DaniWeb® LLC