'required' => '0',
'min_ver' => '2.13',
},
+ 'Moo' => {
+ 'usage' => 'Core',
+ 'required' => '0',
+ 'min_ver' => '1',
+ },
'String::Random' => {
'usage' => 'OpacSelfRegistration',
'required' => '1',
'required' => '0',
'min_ver' => '0.0.3',
},
+ 'XML::Writer' => {
+ 'usage' => 'Command line scripts',
+ 'required' => '0',
+ 'min_ver' => '0.614',
+ },
};
1;
--- /dev/null
+package Koha::Sitemapper;
+
+#
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use Moo;
+use Modern::Perl;
+use Koha::Sitemapper::Writer;
+use C4::Context;
+
+
+# Base URL prepended to every location written by the writer (record URLs and
+# sitemap file URLs in the index).
+has url => ( is => 'rw', );
+
+# Directory in which the sitemap files are created. When a value is assigned,
+# the trigger prints a message and terminates the program if the value is not
+# an existing directory.
+# NOTE(review): per Moo's documentation triggers do not fire for default
+# values, so the default '.' is presumably never validated -- confirm.
+has dir => (
+ is => 'rw',
+ default => '.',
+ trigger => sub {
+ my ($self, $dir) = @_;
+ unless (-d $dir) {
+ say "This is not a valid directory: $dir";
+ exit;
+ }
+ }
+);
+
+# When true, record URLs end with '/bib/<biblionumber>'; otherwise they use
+# the '/cgi-bin/koha/opac-detail.pl?biblionumber=<biblionumber>' form.
+has short => ( is => 'rw', default => 1 );
+
+# When true, progress and summary messages are printed with say().
+has verbose => ( is => 'rw', default => 0 );
+
+# Statement handle over the biblio rows; set by run() and read by process().
+has sth => ( is => 'rw' );
+
+# Koha::Sitemapper::Writer instance created by run().
+has writer => ( is => 'rw', );
+
+# Number of biblio records processed so far; incremented by process().
+has count => ( is => 'rw', default => 0);
+
+
+# Generate all sitemap files: create the writer, query every biblio record
+# (biblionumber and timestamp), then hand the rows one by one to process()
+# until it reports that no row is left. In verbose mode a progress line is
+# printed each time the processed count is a multiple of 10,000.
+sub run {
+    my ($self) = @_;
+
+    if ( $self->verbose ) {
+        say "Creation of Sitemap files in '" . $self->dir . "' directory";
+    }
+
+    my $file_writer = Koha::Sitemapper::Writer->new( sitemapper => $self );
+    $self->writer($file_writer);
+
+    my $query = "SELECT biblionumber, timestamp FROM biblio";
+    my $sth   = C4::Context->dbh->prepare($query);
+    $sth->execute();
+    $self->sth($sth);
+
+    # process() returns a false value once the result set is exhausted.
+    while ( $self->process() ) {
+        next unless $self->verbose;
+        next if $self->count % 10000;
+        say "..... ", $self->count;
+    }
+}
+
+
+# Handle the next row of the statement handle.
+#
+# When a row is available, it is passed to the writer, the record counter is
+# incremented, and the new count (a true value) is returned. When no row is
+# left, the writer is finalized, a summary is printed in verbose mode, and an
+# empty/undefined value is returned so that the caller's loop stops.
+sub process {
+    my ($self) = @_;
+
+    my ($biblionumber, $timestamp) = $self->sth->fetchrow;
+
+    if ($biblionumber) {
+        $self->writer->write($biblionumber, $timestamp);
+        my $processed = $self->count + 1;
+        $self->count($processed);
+        return $self->count;
+    }
+
+    # No more rows: close the last sitemap file and write the index.
+    $self->writer->end();
+    if ( $self->verbose ) {
+        say "Number of biblio records processed: ", $self->count, "\n" .
+            "Number of Sitemap files: ", $self->writer->count;
+    }
+    return;
+}
+
+
+1;
--- /dev/null
+package Koha::Sitemapper::Writer;
+
+#
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+use Moo;
+use Modern::Perl;
+use XML::Writer;
+use IO::File;
+use DateTime;
+
+
+# Maximum number of URLs written to a single sitemap file; write() starts a
+# new file each time this number is reached.
+my $MAX = 50000;
+
+
+# Back-reference to the object that supplies the 'dir', 'url' and 'short'
+# settings read by this writer.
+has sitemapper => (is => 'rw', );
+
+# Number of URLs written to the current sitemap file. It starts at $MAX so
+# that the very first write() call opens the first file.
+has current => ( is => 'rw', default => $MAX );
+
+# Number of sitemap files created so far; also used to number the files.
+has count => ( is => 'rw', default => 0 );
+
+# XML::Writer object for the sitemap file currently being written.
+has writer => ( is => 'rw', );
+
+
+
+# Create a file named $name in the sitemapper's output directory and return
+# an XML::Writer bound to it, with the XML declaration already written.
+#
+# Parameters:
+#   $name - file name, relative to the sitemapper's 'dir'.
+# Returns: the XML::Writer object.
+# If the file cannot be created, a message is printed and the program exits.
+sub _writer_create {
+    my ($self, $name) = @_;
+    $name = $self->sitemapper->dir . "/$name";
+
+    # Pass the mode as a separate argument: with a single ">$name" string the
+    # mode is parsed out of the file name, so unusual characters or
+    # leading/trailing whitespace in the directory name could be
+    # misinterpreted.
+    my $fh = IO::File->new($name, '>');
+    unless ($fh) {
+        say "Impossible to create file: $name";
+        exit;
+    }
+
+    my $writer = XML::Writer->new(
+        OUTPUT      => $fh,
+        DATA_MODE   => 1,
+        DATA_INDENT => 2,
+    );
+    $writer->xmlDecl("UTF-8");
+    return $writer;
+}
+
+
+# Finish the sitemap file currently being written, if any: close the root
+# element, end the XML document and close the underlying file handle.
+# Does nothing when no writer has been created yet.
+sub _writer_end {
+    my ($self) = @_;
+
+    my $xml = $self->writer;
+    return unless $xml;
+
+    $xml->endTag();
+    $xml->end();
+    $xml->getOutput()->close();
+}
+
+
+# Append one <url> entry for a biblio record to the current sitemap file.
+#
+# Parameters:
+#   $biblionumber - appended to the record URL.
+#   $timestamp    - its first 10 characters are written as <lastmod>.
+#
+# When the current file already holds $MAX entries (which is also the initial
+# state), the current file is finished and a new 'sitemapNNNN.xml' file is
+# started before the entry is written.
+sub write {
+    my ($self, $biblionumber, $timestamp) = @_;
+
+    if ( $self->current == $MAX ) {
+        $self->_writer_end();
+        $self->count( $self->count + 1 );
+
+        my $filename = sprintf("sitemap%04d.xml", $self->count);
+        my @urlset_attributes = (
+            'xmlns'              => 'http://www.sitemaps.org/schemas/sitemap/0.9',
+            'xmlns:xsi'          => 'http://www.w3.org/2001/XMLSchema-instance',
+            'xsi:schemaLocation' => 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd',
+        );
+
+        my $fresh = $self->_writer_create($filename);
+        $fresh->startTag('urlset', @urlset_attributes);
+        $self->writer($fresh);
+        $self->current(0);
+    }
+
+    $self->current( $self->current + 1 );
+
+    # Build the record URL in its short or long form.
+    my $sitemapper = $self->sitemapper;
+    my $path;
+    if ( $sitemapper->short ) {
+        $path = '/bib/';
+    }
+    else {
+        $path = '/cgi-bin/koha/opac-detail.pl?biblionumber=';
+    }
+    my $url = $sitemapper->url . $path . $biblionumber;
+
+    # dataElement() is XML::Writer's shorthand for startTag/characters/endTag.
+    my $xml = $self->writer;
+    $xml->startTag('url');
+    $xml->dataElement('loc', $url);
+    $xml->dataElement('lastmod', substr($timestamp, 0, 10));
+    $xml->endTag();
+}
+
+
+# Finish the sitemap generation: close the sitemap file currently open (if
+# any), then write 'sitemapindex.xml', which lists every sitemap file created
+# so far with today's date as <lastmod>.
+#
+# Dies if the index file cannot be closed (e.g. a buffered write failed).
+sub end {
+    my $self = shift;
+
+    $self->_writer_end();
+
+    my $w = $self->_writer_create("sitemapindex.xml");
+    $w->startTag('sitemapindex', 'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9');
+    my $now      = DateTime->now()->ymd;
+    my $base_url = $self->sitemapper->url;
+    for my $i ( 1..$self->count ) {
+        $w->startTag('sitemap');
+        $w->startTag('loc');
+        my $name = sprintf("sitemap%04d.xml", $i);
+        $w->characters($base_url . "/$name");
+        $w->endTag();
+        $w->startTag('lastmod');
+        $w->characters($now);
+        $w->endTag();
+        $w->endTag();
+    }
+    $w->endTag();
+
+    # Close the index file explicitly instead of relying on object
+    # destruction, so that buffered data is flushed now and a failure is
+    # reported rather than silently lost.
+    # NOTE(review): $w->end() is deliberately not called here; it appears to
+    # append a trailing newline, which would change the bytes of
+    # sitemapindex.xml -- confirm before adding it.
+    $w->getOutput()->close()
+        or die "Impossible to close file sitemapindex.xml: $!\n";
+}
+
+
+1;
\ No newline at end of file
--- /dev/null
+#!/usr/bin/perl
+
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+package Main;
+
+use Modern::Perl;
+use utf8;
+use Pod::Usage;
+use Getopt::Long;
+use C4::Biblio;
+use Koha::Sitemapper;
+
+
+# Command line options and their default values.
+my $verbose = 0;
+my $help    = 0;
+my $url     = '';
+my $dir     = '.';
+my $short   = 1;
+
+GetOptions(
+    'verbose' => \$verbose,
+    'help'    => \$help,
+    'url=s'   => \$url,
+    'dir=s'   => \$dir,
+    'short!'  => \$short,
+);
+
+# Print the full POD documentation of this script and stop.
+sub usage {
+    pod2usage( -verbose => 2 );
+    exit;
+}
+
+if ($help) {
+    usage();
+}
+
+# Without --url, fall back to the OPACBaseURL system preference.
+unless ($url) {
+    $url = C4::Context->preference("OPACBaseURL");
+    unless ($url) {
+        say "OPACBaseURL syspref isn't defined. You can use --url parameter.";
+        exit;
+    }
+    # Add a scheme only when the preference does not already carry one, so
+    # that a value such as 'https://opac.example.org' is not turned into
+    # 'http://https://opac.example.org'.
+    $url = 'http://' . $url
+        unless $url =~ m{\A[a-z][a-z0-9+.\-]*://}i;
+}
+# Drop trailing slashes: the writer appends paths starting with '/'.
+$url =~ s/\/*$//g;
+
+my $sitemaper = Koha::Sitemapper->new(
+    verbose => $verbose,
+    url     => $url,
+    dir     => $dir,
+    short   => $short,
+);
+$sitemaper->run();
+
+
+=head1 USAGE
+
+=over
+
+=item sitemap.pl [--verbose|--help|--short|--noshort|--url|--dir]
+
+=back
+
+=head1 SYNOPSIS
+
+ sitemap.pl --verbose
+ sitemap.pl --noshort --dir /home/koha/mylibrary/www
+
+=head1 DESCRIPTION
+
+Process all biblio records from a Koha instance and generate Sitemap files
+complying with the protocol described on L<http://sitemaps.org>. The goal of
+this script is to give search engines direct access to biblio records. It
+avoids leaving search engines to crawl the Koha OPAC on their own, which
+generates a lot of traffic and workload for a poor result.
+
+A file named F<sitemapindex.xml> is generated. It contains references to
+multiple Sitemap files. Each file contains at most 50,000 URLs, and is named
+F<sitemapXXXX.xml>.
+
+The files must be stored in the Koha OPAC root directory, i.e.
+F<< <koha-root>/koha-tmpl/ >>. Also place a F<robots.txt> file in this
+directory, like this one:
+
+ Sitemap: sitemapindex.xml
+ User-agent: *
+ Disallow: /cgi-bin/
+
+=head1 PARAMETERS
+
+=over
+
+=item B<--url=Koha OPAC base URL>
+
+If omitted, OPACBaseURL syspref is used.
+
+=item B<--short|noshort>
+
+By default, --short. With --short, URL to bib record ends with
+/bib/biblionumber. With --noshort, URL ends with
+/cgi-bin/koha/opac-detail.pl?biblionumber=bibnum
+
+=item B<--dir>
+
+Directory where to write sitemap files. By default, the current directory.
+
+=item B<--verbose|-v>
+
+Enable script verbose mode: a message is displayed for each 10,000 biblio
+records processed.
+
+=item B<--help|-h>
+
+Print this help page.
+
+=back
+
+=cut
--- /dev/null
+#!/usr/bin/perl
+
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use Modern::Perl;
+use Test::MockModule;
+use File::Basename;
+use File::Path;
+use DateTime;
+use Test::More tests => 14;
+
+
+# Check at compile time that both modules under test can be loaded.
+BEGIN {
+    use_ok($_) for ( 'Koha::Sitemapper', 'Koha::Sitemapper::Writer' );
+}
+
+
+# Return the whole content of $path as a single string; dies if the file
+# cannot be opened.
+sub slurp {
+    my ($path) = @_;
+    open my $in, '<', $path or die;
+    # With the input record separator undefined, one read returns everything.
+    my $content = do { local $/; <$in> };
+    close $in;
+    return $content;
+}
+
+
+# Create 3 mocked datasets to be used by Koha::Sitemapper in place of DB
+# content. C4::Context's _new_dbh is replaced so that the database handle is
+# a DBD::Mock handle.
+my $module_context = Test::MockModule->new('C4::Context');
+$module_context->mock('_new_dbh', sub {
+    my $dbh = DBI->connect( 'DBI:Mock:', '', '' )
+        || die "Cannot create handle: $DBI::errstr\n";
+    return $dbh
+});
+my $dbh = C4::Context->dbh();
+my $two_bibs = [
+    [ qw/ biblionumber timestamp / ],
+    [ qw/ 1234 2013-11-15 / ],
+    [ qw/ 9875 2015-08-31 / ],
+];
+my $lotof_bibs = [ [ qw/ biblionumber timestamp / ] ];
+push @$lotof_bibs, [ $_, '2015-08-31' ] for 1..75000;
+# One result set is queued per run() call below: the two-biblio set is used
+# twice (long URLs, then short URLs), followed by the 75000-biblio set.
+$dbh->{mock_add_resultset} = $two_bibs;
+$dbh->{mock_add_resultset} = $two_bibs;
+$dbh->{mock_add_resultset} = $lotof_bibs;
+
+my $dir = File::Spec->rel2abs( dirname(__FILE__) );
+
+# Create a sitemap for a catalog containing 2 biblios, with option 'long url'
+my $sitemaper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 0,
+);
+$sitemaper->run();
+
+my $file = "$dir/sitemapindex.xml";
+ok( -e $file, "File sitemapindex.xml created");
+my $file_content = slurp($file);
+my $now = DateTime->now->ymd;
+my $expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <sitemap>
+ <loc>http://www.mylibrary.org/sitemap0001.xml</loc>
+ <lastmod>$now</lastmod>
+ </sitemap>
+</sitemapindex>
+EOS
+# The index file is expected to have no trailing newline: drop the one added
+# by the heredoc.
+chop $expected_content;
+# is() reports both strings on failure, unlike ok() on an 'eq' comparison.
+is( $file_content, $expected_content, "Its content is valid" );
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+ <url>
+ <loc>http://www.mylibrary.org/cgi-bin/koha/opac-detail.pl?biblionumber=1234</loc>
+ <lastmod>2013-11-15</lastmod>
+ </url>
+ <url>
+ <loc>http://www.mylibrary.org/cgi-bin/koha/opac-detail.pl?biblionumber=9875</loc>
+ <lastmod>2015-08-31</lastmod>
+ </url>
+</urlset>
+EOS
+is( $file_content, $expected_content, "Its content is valid" );
+
+
+# Create a sitemap for a catalog containing 2 biblios, with option 'short url'.
+# Check the content of the generated sitemap0001.xml file.
+$sitemaper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 1,
+);
+$sitemaper->run();
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml with short URLs created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+ <url>
+ <loc>http://www.mylibrary.org/bib/1234</loc>
+ <lastmod>2013-11-15</lastmod>
+ </url>
+ <url>
+ <loc>http://www.mylibrary.org/bib/9875</loc>
+ <lastmod>2015-08-31</lastmod>
+ </url>
+</urlset>
+EOS
+# is() reports both strings on failure, unlike ok() on an 'eq' comparison.
+is( $file_content, $expected_content, "Its content is valid" );
+
+
+# Create a sitemap for a catalog containing 75000 biblios, with option 'short
+# url'. Test that 3 files are created: index file + 2 urls files with
+# respectively 50000 and 25000 urls.
+$sitemaper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 1,
+);
+$sitemaper->run();
+
+$file = "$dir/sitemapindex.xml";
+ok( -e $file, "File sitemapindex.xml for 75000 bibs created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <sitemap>
+ <loc>http://www.mylibrary.org/sitemap0001.xml</loc>
+ <lastmod>$now</lastmod>
+ </sitemap>
+ <sitemap>
+ <loc>http://www.mylibrary.org/sitemap0002.xml</loc>
+ <lastmod>$now</lastmod>
+ </sitemap>
+</sitemapindex>
+EOS
+# The index file is expected to have no trailing newline: drop the one added
+# by the heredoc.
+chop $expected_content;
+is( $file_content, $expected_content, "Its content is valid" );
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml created");
+
+# Count the <loc> elements; the writer emits one per line.
+open my $fh, "<", $file or die "Cannot open $file: $!";
+my $count = 0;
+while (<$fh>) {
+    $count++ if /<loc>/;
+}
+close $fh;
+is( $count, 50000, "It contains 50000 URLs" );
+
+$file = "$dir/sitemap0002.xml";
+ok( -e $file, "File sitemap0002.xml created");
+
+open $fh, "<", $file or die "Cannot open $file: $!";
+$count = 0;
+while (<$fh>) {
+    $count++ if /<loc>/;
+}
+close $fh;
+is( $count, 25000, "It contains 25000 URLs" );
+
+# Cleanup: remove the files generated by the scenarios above.
+unlink map { "$dir/$_" } qw( sitemapindex.xml sitemap0001.xml sitemap0002.xml );