#!/usr/bin/perl
use strict;
use warnings;

use LWP::Simple;
use URI;
use XML::Feed;

# The feed only has the latest 50 entries but that's good enough
# for our purposes
# The feed only has the latest 50 entries but that's good enough
# for our purposes
my $lu_feed = 'http://archive.org/services/collection-rss.php?collection=laurentianuniversitystudentnewspapers';

# Handing XML::Feed a URI object makes it fetch and parse the feed itself.
# (URI is now loaded explicitly rather than relying on XML::Feed to pull
# it in as a side effect.)
my $feed = XML::Feed->parse( URI->new($lu_feed) ) or die XML::Feed->errstr;

# Walk every issue in the feed, locate its full-text page, and dump it.
for my $entry ( $feed->entries ) {
    my $page = get( $entry->link() ) or die $!;

    # Reduce the issue page to just the relative link of its full-text
    # (OCR) version.
    $page =~ s{^.*<a href="/(stream/.*?\.txt)">Full Text</a>.*$}{$1}s;

    # Some issues have not yet been OCRed and thus do not have full text;
    # the substitution then leaves the whole page behind, so skip those.
    next if length($page) > 100;

    extract_text("http://archive.org/$page");
}

# Emit a literal backslash-dot line as the end-of-data marker.
# NOTE(review): this looks like a PostgreSQL COPY terminator — confirm
# against whatever consumes this script's output.
print '\.', "\n";

exit();

# Fetch one issue's full-text page, strip it down to the OCR'd text, and
# print a single tab-separated record:
#   details-page URL, full-text URL, volume, issue, newline-escaped text
# Returns early (and prints nothing) when the URL carries no vol/issue
# markers.
sub extract_text {
    my $text_url = shift;

    # e.g. ".../stream/..._vol_12_3_..." -> base URL, volume 12, issue 3
    my ($baseurl, $volume, $issue) = ($text_url =~ m/^(.*)\/.*?vol_(\d+)_(\d+)_/);

    # BUG FIX: this previously used `next`, which is loop control and is
    # invalid inside a subroutine ("Exiting subroutine via next" warning;
    # a fatal error when called outside any loop). `return` is what was
    # meant.
    return unless $baseurl;
    $baseurl =~ s{/stream/}{/details/};

    # (Dropped the original's no-op `. ''` string concatenation.)
    my $text = get($text_url) or die $!;
    $text =~ s{^.*<pre>(.*?)</pre>.*$}{$1}s;

    # Naive dehyphenation... that seems to work well
    $text =~ s{- \n}{}sg;
    # Remove explicit nulls - weird but oh well
    $text =~ s{\\0}{}sg;
    # Escape newlines so the whole record stays on one physical line
    $text =~ s{\n}{\\n}sg;

    print "$baseurl\t$text_url\t$volume\t$issue\t$text\n";

    return;
}
