23-01-2011

Perl Scrape HTML and output RSS

This is a quick, simple example of scraping an HTML page's content and turning it into something else (an RSS feed) using Perl regular expressions and the XML::RSS module.

#!/usr/bin/perl -w
 
use strict;
use LWP::UserAgent;
use HTML::Entities;
use XML::RSS;
use DateTime::Format::Mail;
 
# Fetch the events calendar page, scrape every event's date, link and title
# out of the HTML with regular expressions, and print the result to STDOUT
# as an RSS 2.0 feed.

# User-Agent string sent with the request instead of LWP's default.
my $ie = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";

my $url = "http://www.somesiteorother.com/upcoming-events/month-calendar";
my $tz  = 'Europe/Amsterdam';

# The channel is stamped with the time the feed was generated.
my $channel_pub_date = DateTime::Format::Mail->format_datetime(
	DateTime->now(time_zone => $tz)
);

my $ua = LWP::UserAgent->new;
$ua->agent($ie);

my $response = $ua->get($url);

# Without the page there is nothing to publish: abort with the HTTP status.
die $response->status_line unless $response->is_success;

my $rss = XML::RSS->new(version => '2.0');

$rss->channel(
	title    => 'Some site or other',
	link     => $url,
	language => 'en',
	pubDate  => $channel_pub_date,
);

# Cut the page into pieces at every event <div>, so that each piece holds
# the markup of at most one event.
my @chunks = split(/<div [^>]* class="event">/, $response->content);

foreach my $chunk (@chunks)
{
	# Pieces without a DATE=YYYY-MM-DD marker are not treated as events.
	my ($year, $month, $day) = $chunk =~ /DATE=(\d{4})-(\d{2})-(\d{2})/
		or next;

	# DateTime->new dies on impossible dates (e.g. month 13, which \d{2}
	# happily matches); skip that one event instead of losing the feed.
	my $item_pub_date = eval {
		DateTime::Format::Mail->format_datetime(
			DateTime->new(
				year      => $year,
				month     => $month,
				day       => $day,
				time_zone => $tz,
			)
		);
	};
	if (!defined $item_pub_date)
	{
		warn "Skipping event with invalid date $year-$month-$day\n";
		next;
	}

	# Capture into named variables and skip the piece when there is no
	# link: after a failed match $1/$2 would still hold the year and month
	# from the date match above. [^"]+ keeps the href capture from running
	# past its closing quote.
	my ($link, $title) = $chunk =~ /<a href="([^"]+)"\s[^>]+>([^<]+)<\/a>/
		or next;

	$rss->add_item(
		title   => $title,
		link    => $link,
		pubDate => $item_pub_date,
	);
}

print $rss->as_string;

# Alternative: write the feed to a file instead of STDOUT.
#$rss->save("file.rss");
 

Comments:

Your comment:

»

 

[x]