Syndetics: start switch to XML::LibXML to parse results
[srvgit] / C4 / External / Syndetics.pm
index ff4fb5b..f61a31c 100644 (file)
@@ -18,6 +18,7 @@ package C4::External::Syndetics;
 # Suite 330, Boston, MA  02111-1307 USA
 
 use XML::Simple;
+use XML::LibXML;
 use LWP::Simple;
 use LWP::UserAgent;
 use HTTP::Request::Common;
@@ -42,6 +43,9 @@ BEGIN {
     );
 }
 
+# File-scoped XML::LibXML parser, created once and reused for every response parsed.
+my $parser = XML::LibXML->new();
+
 =head1 NAME
 
 C4::External::Syndetics - Functions for retrieving Syndetics content in Koha
@@ -68,7 +72,7 @@ sub get_syndetics_index {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/INDEX.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/INDEX.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
 
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
@@ -101,7 +105,7 @@ sub get_syndetics_summary {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/SUMMARY.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/SUMMARY.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;
@@ -130,7 +134,7 @@ sub get_syndetics_toc {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/TOC.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/TOC.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;
@@ -159,7 +163,7 @@ sub get_syndetics_excerpt {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/DBCHAPTER.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/DBCHAPTER.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;
@@ -206,7 +210,7 @@ sub get_syndetics_reviews {
             #warn "Skipping $source->{element} doesn't match $syndetics_elements->{$source->{element}} \n";
             next;
         }
-        my $url = "http://syndetics.com/index.aspx?isbn=$isbn/$source->{file}&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+        my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/$source->{file}&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
 
         my $ua = LWP::UserAgent->new;
         $ua->timeout(10);
@@ -219,27 +223,21 @@ sub get_syndetics_reviews {
 
         my $content = $response->content;
         warn "could not retrieve $url" unless $content;
-        my $xmlsimple = XML::Simple->new();
-        eval {
-        $response = $xmlsimple->XMLin(
-            $content,
-            ForceContent => 1,
-            forcearray => [ qw(Fld520) ]
-        ) unless !$content;
+       
+        eval { 
+            my $doc = $parser->parse_string($content);
+
+            # Note that findvalue returns only the text content of the
+            # matched nodes, so any HTML tags embedded in the review are
+            # dropped.  That helps us handle slight differences in the
+            # output of the 'old' and 'new' versions of the Syndetics
+            # service and cleans up any questionable HTML in the reviews,
+            # but it also means any <B> and <I> formatting tags are lost.
+            my $result = $doc->findvalue('//Fld520');
+            push @reviews, {title => $source->{title}, reviews => [ { content => $result } ]} if $result;
         };
-            
-        for my $subfield_a (@{$response->{VarFlds}->{VarDFlds}->{Notes}->{Fld520}}) {
-            my @content;
-            # this is absurd, but sometimes this data serializes differently
-            if(ref($subfield_a->{a}->{content}) eq 'ARRAY') {
-                for my $content (@{$subfield_a->{a}->{content}}) {
-                    push @content, {content => $content};
-                }
-            }
-            else {
-                push @content, {content => $subfield_a->{a}->{content}};
-            }
-            push @reviews, {title => $source->{title}, reviews => \@content};
+        if ($@) {
+            warn "Error parsing response from $url";
         }
     }
     return \@reviews;
@@ -251,7 +249,7 @@ sub get_syndetics_editions {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/FICTION.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/FICTION.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;
@@ -281,7 +279,7 @@ sub get_syndetics_anotes {
     # grab the AWSAccessKeyId: mine is '0V5RRRRJZ3HR2RQFNHR2'
     my $syndetics_client_code = C4::Context->preference('SyndeticsClientCode');
 
-    my $url = "http://syndetics.com/index.aspx?isbn=$isbn/ANOTES.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
+    my $url = "http://www.syndetics.com/index.aspx?isbn=$isbn/ANOTES.XML&client=$syndetics_client_code&type=xw10&upc=$upc&oclc=$oclc";
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;