AppleTastic 0 Newbie Poster

Hi. My intention is to take an HTML file given by a relative path, extract all of its relative links, then open each linked file and extract the relative links from those files, and so on.

In other words, I need to build a tree of linked HTML pages starting from an initial HTML file.

I have written the code below, which takes an option selecting relative or absolute links plus a file name, and writes all matching links to a text file. The only problem now is that I need to pass those file names back in, so that the program can run recursively and grab all of the chained HTML pages.

To run it, I enter the following at the command prompt: filename.pl -r filename.htm

Here is my code. Any suggestions?
Thanks in advance!

#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use LWP::Simple;
use HTML::Parser;
#
# Grab all links from local or remote html file
# perl html munging
#
# option -a (/ -r) grabs only absolute (/ relative) urls
 
# get options and argument
#
# -a keeps only absolute links, -r keeps only relative links; the two
# switches are mutually exclusive. The one positional argument is either
# a filename or a URL.
my %opts;
my $usage = "Usage: $0 [-a | -r] filename [| URL]\n";

# getopts warns and returns false on an unknown switch; treat that as a
# usage error instead of silently carrying on
getopts('ar', \%opts)
	or die $usage;

my $arg = shift;
die $usage
	if (not defined $arg or $opts{a} && $opts{r}); # allow either -a or -r
 
# get the page either from file or url
#
# $page ends up holding the complete HTML text as a single string.
my $page;
if ($arg =~ m!^https?://!i) {
	# LWP::Simple::get returns undef on failure and does not set $!, so
	# $! is left out of the message; testing definedness (rather than
	# truth) also lets an empty response body through
	$page = get($arg);
	die "Couldn't get $arg\n"
		unless defined $page;
}
else {
	# lexical handle: closed automatically if we die, and no global FH
	open my $fh, "<", $arg
		or die "Couldn't open $arg: $!\n";
	$page = do { local $/; <$fh> };	# undef $/ => read the whole file at once
	close $fh;
}
 
# set the parser and parse
#
my @links;	# every href that passes the -a / -r filter, in document order

# Start-tag handler registered with HTML::Parser below.
#   $tag  - tag name as passed by the parser ("tagname" argspec)
#   $attr - hash ref of the tag's attributes ("attr" argspec)
# Pushes the href of each <a> tag onto @links, unless the -a / -r switch
# in %opts excludes it.
sub start {
	my ($tag, $attr) = @_;
	return unless $tag eq 'a' and defined $attr->{href};

	# classify once, anchored at the start of the href, so both switches
	# agree on what "absolute" means (http or https, any letter case)
	my $is_absolute = $attr->{href} =~ m!^https?://!i;

	return if $is_absolute and $opts{r};	# exclude absolute url when -r
	return if !$is_absolute and $opts{a};	# exclude relative url when -a

	push @links, $attr->{href};
	return;
}

my $parser = HTML::Parser->new( api_version => 3,
								start_h => [\&start, "tagname, attr"],
							 );
$parser->parse($page);
$parser->eof;	# signal end of document so buffered text is flushed
 
# output
#
# Writes the collected links to TreeList.txt, one per line.
# $append now means what it says: true adds to the existing file, false
# overwrites it. It is set to 1 so the effective behaviour (append) is
# the same as before the flag was corrected.
my $append = 1;
my $mode = $append ? ">>" : ">";

open my $out, $mode, "TreeList.txt"
	or die "Couldn't open TreeList.txt: $!\n";

print {$out} "$_\n" for @links;

#*** Close the file ***
# buffered write errors only surface at close, so check it
close $out
	or die "Couldn't close TreeList.txt: $!\n";
Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.