#!/usr/bin/perl

use strict;
use warnings;

use LWP::Simple;
use XML::Feed;

# Scrape the OCR'd full text of Laurentian University student newspapers
# from archive.org and emit one tab-delimited row per issue, in PostgreSQL
# COPY text format (terminated by a literal "\." line).

# The feed only has the latest 50 entries but that's good enough
# for our purposes
my $lu_feed = 'http://archive.org/services/collection-rss.php?collection=laurentianuniversitystudentnewspapers';

my $feed = XML::Feed->parse(URI->new($lu_feed))
    or die XML::Feed->errstr;

for my $entry ($feed->entries) {
    my $url     = $entry->link();
    # LWP::Simple::get returns undef on failure and does NOT set $!,
    # so report the URL rather than the (stale) errno.
    my $details = get($url) or die "GET $url failed";

    # Find the link to the full-text version. Some issues have not yet
    # been OCRed and thus have no such link; skip those.
    # NOTE(review): the original pattern had no capture group, so its $1
    # replacement was garbage; the href-based pattern below is a
    # reconstruction — confirm against a live details page.
    next unless $details =~ m{href="([^"]+)"[^>]*>\s*Full\s+Text}s;

    extract_text("http://archive.org/$1");
}

# End-of-data marker for PostgreSQL COPY input.
print "\\.\n";
exit();

# extract_text($text_url)
#
# Fetch one issue's full-text page, strip it down to the OCR text, and
# print a single tab-delimited row:
#   details-URL \t text-URL \t volume \t issue \t text
# with embedded newlines in the text encoded as literal "\n".
# Returns silently if the URL does not look like an issue page.
sub extract_text {
    my $text_url = shift;

    my ($baseurl, $volume, $issue) =
        ($text_url =~ m{^(.*)/.*?vol_(\d+)_(\d+)_});
    # Original said "next" here — exiting a subroutine via next is a
    # runtime warning and does not do what was intended; return instead.
    return unless $baseurl;

    $baseurl =~ s{/stream/}{/details/};

    my $text = get($text_url) or die "GET $text_url failed";

    # Keep only the OCR text body.
    # NOTE(review): assumes the text is wrapped in <pre>...</pre>; the
    # original pattern was garbled (it always captured the empty string) —
    # confirm the real delimiters against a live stream page.
    $text =~ s{^.*<pre>(.*?)</pre>.*$}{$1}s;

    # Naive dehyphenation... that seems to work well
    $text =~ s{- \n}{}sg;

    # Remove explicit nulls - weird but oh well
    $text =~ s{\\0}{}sg;

    # Line everything up: one physical line per record, newlines encoded.
    $text =~ s{\n}{\\n}sg;

    print "$baseurl\t$text_url\t$volume\t$issue\t$text\n";
}