Merge commit 'kc/master'

[koha_fer] / C4 / Tags.pm
diff --git a/C4/Tags.pm b/C4/Tags.pm

index 0eebb0c..a260383 100644 (file)
--- a/C4/Tags.pm
+++ b/C4/Tags.pm
@@ -26,7 +26,7 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  use vars qw($ext_dict $select_all @fields);
  
  BEGIN {
-       $VERSION = 0.01;
+       $VERSION = 0.03;
         @ISA = qw(Exporter);
         @EXPORT_OK = qw(
                 &get_tag &get_tags &get_tag_rows
@@ -35,6 +35,12 @@ BEGIN {
                 &remove_tag
                 &delete_tag_rows_by_ids
                 &rectify_weights
+               &get_approval_rows
+               &blacklist
+               &whitelist
+               &is_approved
+               &approval_counts
+               &get_filters
         );
         # %EXPORT_TAGS = ();
         $ext_dict = C4::Context->preference('TagsExternalDictionary');
@@ -45,7 +51,7 @@ BEGIN {
         }
         if ($ext_dict) {
                 require Lingua::Ispell;
-               import Lingua::Ispell qw(spellcheck);
+               import Lingua::Ispell qw(spellcheck add_word_lc save_dictionary);
         }
  }
  
@@ -56,10 +62,44 @@ INIT {
         $select_all = "SELECT " . join(',',@fields) . "\n FROM   tags_all\n";
  }
  
-sub remove_tag ($) {
-       my $tag_id = shift;
-       my $rows = get_tag_rows({tag_id=>$tag_id}) or return 0;
-       (scalar(@$rows) == 1) or return undef;
+sub get_filters (;$) {
+       my $query = "SELECT * FROM tags_filters ";
+       my ($sth);
+       if (@_) {
+               $sth = C4::Context->dbh->prepare($query . " WHERE filter_id = ? ");
+               $sth->execute(shift);
+       } else {
+               $sth = C4::Context->dbh->prepare($query);
+               $sth->execute;
+       }
+       return $sth->fetchall_arrayref({});
+}
+
+#      (SELECT count(*) FROM tags_all     ) as tags_all,
+#      (SELECT count(*) FROM tags_index   ) as tags_index,
+
+sub approval_counts () { 
+       my $query = "SELECT
+               (SELECT count(*) FROM tags_approval WHERE approved= 1) as approved_count,
+               (SELECT count(*) FROM tags_approval WHERE approved=-1) as rejected_count,
+               (SELECT count(*) FROM tags_approval WHERE approved= 0) as unapproved_count
+       ";
+       my $sth = C4::Context->dbh->prepare($query);
+       $sth->execute;
+       my $result = $sth->fetchrow_hashref();
+       $result->{approved_total} = $result->{approved_count} + $result->{rejected_count} + $result->{unapproved_count};
+       $debug and warn "counts returned: " . Dumper $result;
+       return $result;
+}
+
+sub remove_tag ($;$) {
+       my $tag_id  = shift or return undef;
+       my $user_id = (@_) ? shift : undef;
+       my $rows = (defined $user_id) ?
+                       get_tag_rows({tag_id=>$tag_id, borrowernumber=>$user_id}) :
+                       get_tag_rows({tag_id=>$tag_id}) ;
+       $rows or return 0;
+       (scalar(@$rows) == 1) or return undef;  # should never happen (duplicate ids)
         my $row = shift(@$rows);
         ($tag_id == $row->{tag_id}) or return 0;
         my $tags = get_tags({term=>$row->{term}, biblionumber=>$row->{biblionumber}});
@@ -122,13 +162,13 @@ sub get_tag_rows ($) {
                         carp "Empty argument key to get_tag_rows: ignoring!";
                         next;
                 }
-               unless (1 == scalar grep {/^ $key $/xi} @ok_fields) {
+               unless (1 == scalar grep {/^ $key $/x} @ok_fields) {
                         carp "get_tag_rows received unreconized argument key '$key'.";
                         next;
                 }
-               if ($key =~ /^limit$/i) {
+               if ($key eq 'limit') {
                         my $val = $hash->{$key};
-                       unless ($val =~ /^\d+$/) {
+                       unless ($val =~ /^(\d+,)?\d+$/) {
                                 carp "Non-nuerical limit value '$val' ignored!";
                                 next;
                         }
@@ -151,9 +191,8 @@ sub get_tag_rows ($) {
  }
  
  sub get_tags (;$) {            # i.e., from tags_index
-       # my $self = shift;
         my $hash = shift || {};
-       my @ok_fields = qw(term biblionumber weight limit sort);
+       my @ok_fields = qw(term biblionumber weight limit sort approved);
         my $wheres;
         my $limit  = "";
         my $order  = "";
@@ -164,18 +203,18 @@ sub get_tags (;$) {               # i.e., from tags_index
                         carp "Empty argument key to get_tags: ignoring!";
                         next;
                 }
-               unless (1 == scalar grep {/^ $key $/xi} @ok_fields) {
+               unless (1 == scalar grep {/^ $key $/x} @ok_fields) {
                         carp "get_tags received unreconized argument key '$key'.";
                         next;
                 }
-               if ($key =~ /^limit$/i) {
+               if ($key eq 'limit') {
                         my $val = $hash->{$key};
-                       unless ($val =~ /^\d+$/) {
+                       unless ($val =~ /^(\d+,)?\d+$/) {
                                 carp "Non-nuerical limit value '$val' ignored!";
                                 next;
                         }
                         $limit = " LIMIT $val\n";
-               } elsif ($key =~ /^sort$/i) {
+               } elsif ($key eq 'sort') {
                         foreach my $by (split /\,/, $hash->{$key}) {
                                 unless (
                                         $by =~ /^([-+])?(term)/ or
@@ -185,14 +224,22 @@ sub get_tags (;$) {               # i.e., from tags_index
                                         carp "get_tags received illegal sort order '$by'";
                                         next;
                                 }
-                               $order .= " ORDER BY $2 " . ($1 eq '-' ? 'DESC' : $1 eq '+' ? 'ASC' : '') . "\n";
+                               if ($order) {
+                                       $order .= ", ";
+                               } else {
+                                       $order = " ORDER BY ";
+                               }
+                               $order .= $2 . " " . ((!$1) ? '' : $1 eq '-' ? 'DESC' : $1 eq '+' ? 'ASC' : '') . "\n";
                         }
                         
                 } else {
-                       my $whereval = $key;
-                       ($key =~ /^term$/i) and $whereval = 'tags_index.term';
-                       $wheres .= ($wheres) ? " AND    $whereval = ?\n" : " WHERE  $whereval = ?\n";
-                       push @exe_args, $hash->{$key};
+                       my $whereval = $hash->{$key};
+                       my $longkey = ($key eq 'term'    ) ? 'tags_index.term'        :
+                                                 ($key eq 'approved') ? 'tags_approval.approved' : $key;
+                       my $op = ($whereval =~ s/^(>=|<=)// or
+                                         $whereval =~ s/^(>|=|<)//   ) ? $1 : '=';
+                       $wheres .= ($wheres) ? " AND    $longkey $op ?\n" : " WHERE  $longkey $op ?\n";
+                       push @exe_args, $whereval;
                 }
         }
         my $query = "
@@ -212,15 +259,90 @@ sub get_tags (;$) {               # i.e., from tags_index
         return $sth->fetchall_arrayref({});
  }
  
+sub get_approval_rows (;$) {           # i.e., from tags_approval
+       my $hash = shift || {};
+       my @ok_fields = qw(term approved date_approved approved_by weight_total limit sort borrowernumber);
+       my $wheres;
+       my $limit  = "";
+       my $order  = "";
+       my @exe_args = ();
+       foreach my $key (keys %$hash) {
+               $debug and print STDERR "get_approval_rows arg. '$key' = ", $hash->{$key}, "\n";
+               unless (length $key) {
+                       carp "Empty argument key to get_approval_rows: ignoring!";
+                       next;
+               }
+               unless (1 == scalar grep {/^ $key $/x} @ok_fields) {
+                       carp "get_approval_rows received unreconized argument key '$key'.";
+                       next;
+               }
+               if ($key eq 'limit') {
+                       my $val = $hash->{$key};
+                       unless ($val =~ /^(\d+,)?\d+$/) {
+                               carp "Non-numerical limit value '$val' ignored!";
+                               next;
+                       }
+                       $limit = " LIMIT $val\n";
+               } elsif ($key eq 'sort') {
+                       foreach my $by (split /\,/, $hash->{$key}) {
+                               unless (
+                                       $by =~ /^([-+])?(term)/            or
+                                       $by =~ /^([-+])?(biblionumber)/    or
+                    $by =~ /^([-+])?(borrowernumber)/  or
+                                       $by =~ /^([-+])?(weight_total)/    or
+                                       $by =~ /^([-+])?(approved(_by)?)/  or
+                                       $by =~ /^([-+])?(date_approved)/
+                               ) {
+                                       carp "get_approval_rows received illegal sort order '$by'";
+                                       next;
+                               }
+                               if ($order) {
+                                       $order .= ", ";
+                               } else {
+                                       $order = " ORDER BY " unless $order;
+                               }
+                               $order .= $2 . " " . ((!$1) ? '' : $1 eq '-' ? 'DESC' : $1 eq '+' ? 'ASC' : '') . "\n";
+                       }
+                       
+               } else {
+                       my $whereval = $hash->{$key};
+                       my $op = ($whereval =~ s/^(>=|<=)// or
+                                         $whereval =~ s/^(>|=|<)//   ) ? $1 : '=';
+                       $wheres .= ($wheres) ? " AND    $key $op ?\n" : " WHERE  $key $op ?\n";
+                       push @exe_args, $whereval;
+               }
+       }
+       my $query = "
+       SELECT  tags_approval.term          AS term,
+                       tags_approval.approved      AS approved,
+                       tags_approval.date_approved AS date_approved,
+                       tags_approval.approved_by   AS approved_by,
+                       tags_approval.weight_total  AS weight_total,
+                       CONCAT(borrowers.surname, ', ', borrowers.firstname) AS approved_by_name
+       FROM    tags_approval
+       LEFT JOIN borrowers
+       ON      tags_approval.approved_by = borrowers.borrowernumber ";
+       $query .= ($wheres||'') . $order . $limit;
+       $debug and print STDERR "get_approval_rows query:\n $query\n",
+                                                       "get_approval_rows query args: ", join(',', @exe_args), "\n";
+       my $sth = C4::Context->dbh->prepare($query);
+       if (@exe_args) {
+               $sth->execute(@exe_args);
+       } else {
+               $sth->execute;
+       }
+       return $sth->fetchall_arrayref({});
+}
+
  sub is_approved ($) {
         my $term = shift or return undef;
-       if ($ext_dict) {
-               return (spellcheck($term) ? 0 : 1);
-       }
         my $sth = C4::Context->dbh->prepare("SELECT approved FROM tags_approval WHERE term = ?");
         $sth->execute($term);
-       $sth->rows or return undef;
-       return $sth->fetch;
+       unless ($sth->rows) {
+               $ext_dict and return (spellcheck($term) ? 0 : 1);       # spellcheck returns empty on OK word
+               return 0;
+       }
+       return $sth->fetchrow;
  }
  
  sub get_tag_index ($;$) {
@@ -236,27 +358,93 @@ sub get_tag_index ($;$) {
         return $sth->fetchrow_hashref;
  }
  
-sub add_tag_approval ($;$) {
+sub whitelist {
+       my $operator = shift;
+       defined $operator or return undef; # have to test defined to allow =0 (kohaadmin)
+       if ($ext_dict) {
+               foreach (@_) {
+                       spellcheck($_) or next;
+                       add_word_lc($_);
+               }
+       }
+       foreach (@_) {
+               my $aref = get_approval_rows({term=>$_});
+               if ($aref and scalar @$aref) {
+                       mod_tag_approval($operator,$_,1);
+               } else {
+                       add_tag_approval($_,$operator);
+               }
+       }
+       return scalar @_;
+}
+# note: there is no "unwhitelist" operation because there is no remove for Ispell.
+# The blacklist regexps should operate "in front of" the whitelist, so if you approve
+# a term mistakenly, you can still reverse it. But there is no going back to "neutral".
+sub blacklist {
+       my $operator = shift;
+       defined $operator or return undef; # have to test defined to allow =0 (kohaadmin)
+       foreach (@_) {
+               my $aref = get_approval_rows({term=>$_});
+               if ($aref and scalar @$aref) {
+                       mod_tag_approval($operator,$_,-1);
+               } else {
+                       add_tag_approval($_,$operator,-1);
+               }
+       }
+       return scalar @_;
+}
+sub add_filter {
+       my $operator = shift;
+       defined $operator or return undef; # have to test defined to allow =0 (kohaadmin)
+       my $query = "INSERT INTO tags_blacklist (regexp,y,z) VALUES (?,?,?)";
+       # my $sth = C4::Context->dbh->prepare($query);
+       return scalar @_;
+}
+sub remove_filter {
+       my $operator = shift;
+       defined $operator or return undef; # have to test defined to allow =0 (kohaadmin)
+       my $query = "REMOVE FROM tags_blacklist WHERE blacklist_id = ?";
+       # my $sth = C4::Context->dbh->prepare($query);
+       # $sth->execute($term);
+       return scalar @_;
+}
+
+sub add_tag_approval ($;$$) {  # or disapproval
+       $debug and warn "add_tag_approval(" . join(", ",map {defined($_) ? $_ : 'UNDEF'} @_) . ")";
         my $term = shift or return undef;
         my $query = "SELECT * FROM tags_approval WHERE term = ?";
         my $sth = C4::Context->dbh->prepare($query);
         $sth->execute($term);
         ($sth->rows) and return increment_weight_total($term);
-       my $ok = (@_ ? shift : 0);
-       if ($ok) {
-               $query = "INSERT INTO tags_approval (term,approved_by,approved,date_approved) VALUES (?,?,1,NOW())";
-               $debug and print STDERR "add_tag_approval query:\n$query\nadd_tag_approval args: ($term,$ok)\n";
-               $sth = C4::Context->dbh->prepare($query);
-               $sth->execute($term,$ok);
+       my $operator = shift || 0;
+       my $approval = (@_ ? shift : 0);        # default is unapproved
+       my @exe_args = ($term);         # all 3 queries will use this argument
+       if ($operator) {
+               $query = "INSERT INTO tags_approval (term,approved_by,approved,date_approved) VALUES (?,?,?,NOW())";
+               push @exe_args, $operator, $approval;
+       } elsif ($approval) {
+               $query = "INSERT INTO tags_approval (term,approved,date_approved) VALUES (?,?,NOW())";
+               push @exe_args, $approval;
         } else {
                 $query = "INSERT INTO tags_approval (term,date_approved) VALUES (?,NOW())";
-               $debug and print STDERR "add_tag_approval query:\n$query\nadd_tag_approval args: ($term)\n";
-               $sth = C4::Context->dbh->prepare($query);
-               $sth->execute($term);
         }
+       $debug and print STDERR "add_tag_approval query: $query\nadd_tag_approval args: (" . join(", ", @exe_args) . ")\n";
+       $sth = C4::Context->dbh->prepare($query);
+       $sth->execute(@exe_args);
         return $sth->rows;
  }
  
+sub mod_tag_approval ($$$) {
+       my $operator = shift;
+       defined $operator or return undef; # have to test defined to allow =0 (kohaadmin)
+       my $term     = shift or return undef;
+       my $approval = (scalar @_ ? shift : 1); # default is to approve
+       my $query = "UPDATE tags_approval SET approved_by=?, approved=?, date_approved=NOW() WHERE term = ?";
+       $debug and print STDERR "mod_tag_approval query: $query\nmod_tag_approval args: ($operator,$approval,$term)\n";
+       my $sth = C4::Context->dbh->prepare($query);
+       $sth->execute($operator,$approval,$term);
+}
+
  sub add_tag_index ($$;$) {
         my $term         = shift or return undef;
         my $biblionumber = shift or return undef;
@@ -265,7 +453,7 @@ sub add_tag_index ($$;$) {
         $sth->execute($term,$biblionumber);
         ($sth->rows) and return increment_weight($term,$biblionumber);
         $query = "INSERT INTO tags_index (term,biblionumber) VALUES (?,?)";
-       $debug and print "add_tag_index query:\n$query\nadd_tag_index args: ($term,$biblionumber)\n";
+       $debug and print STDERR "add_tag_index query: $query\nadd_tag_index args: ($term,$biblionumber)\n";
         $sth = C4::Context->dbh->prepare($query);
         $sth->execute($term,$biblionumber);
         return $sth->rows;
@@ -311,7 +499,7 @@ sub increment_weights ($$) {
  }
  sub decrement_weights ($$) {
         decrement_weight(@_);
-       derement_weight_total(shift);
+       decrement_weight_total(shift);
  }
  sub increment_weight_total ($) {
         _set_weight_total('weight_total+1',shift);
@@ -330,7 +518,7 @@ sub _set_weight_total ($$) {
         UPDATE tags_approval
         SET    weight_total=" . (shift) . "
         WHERE  term=?
-       ");
+       ");                                             # note: CANNOT use "?" for weight_total (see the args above).
         $sth->execute(shift);   # just the term
  }
  sub _set_weight ($$$) {
@@ -348,25 +536,35 @@ sub add_tag ($$;$$) {     # biblionumber,term,[borrowernumber,approvernumber]
         my $biblionumber = shift or return undef;
         my $term         = shift or return undef;
         my $borrowernumber = (@_) ? shift : 0;          # the user, default to kohaadmin
-
-       # first, add to tags regardless of approaval
+       $term =~ s/^\s+//;
+       $term =~ s/\s+$//;
+       ($term) or return undef;        # must be more than whitespace
+       my $rows = get_tag_rows({biblionumber=>$biblionumber, borrowernumber=>$borrowernumber, term=>$term, limit=>1});
         my $query = "INSERT INTO tags_all
         (borrowernumber,biblionumber,term,date_created)
         VALUES (?,?,?,NOW())";
-       $debug and print STDERR "add_tag query:\n $query\n",
+       $debug and print STDERR "add_tag query: $query\n",
                                                         "add_tag query args: ($borrowernumber,$biblionumber,$term)\n";
+       if (scalar @$rows) {
+               $debug and carp "Duplicate tag detected.  Tag not added.";      
+               return undef;
+       }
+       # add to tags_all regardless of approval
         my $sth = C4::Context->dbh->prepare($query);
         $sth->execute($borrowernumber,$biblionumber,$term);
  
         # then 
-       if (@_) {       # if an arg remains, it is the borrowernumber of the approver: tag is pre-approved.
+       if (scalar @_) {        # if arg remains, it is the borrowernumber of the approver: tag is pre-approved.
                 my $approver = shift;
-               add_tag_approval($term,$approver);
+               $debug and print STDERR "term '$term' pre-approved by borrower #$approver\n";
+               add_tag_approval($term,$approver,1);
                 add_tag_index($term,$biblionumber,$approver);
-       } elsif (is_approved($term)) {
-               add_tag_approval($term,1);
+       } elsif (is_approved($term) >= 1) {
+               $debug and print STDERR "term '$term' approved by whitelist\n";
+               add_tag_approval($term,0,1);
                 add_tag_index($term,$biblionumber,1);
         } else {
+               $debug and print STDERR "term '$term' NOT approved (yet)\n";
                 add_tag_approval($term);
                 add_tag_index($term,$biblionumber);
         }
@@ -383,6 +581,80 @@ More verose debugging messages are sent in the presence of non-zero $ENV{"DEBUG"
  
  =head3 TO DO: Add real perldoc
  
+=cut
+
+=head2 External Dictionary (Ispell) [Recommended]
+
+An external dictionary can be used as a means of "pre-populating" and tracking
+allowed terms based on the widely available Ispell dictionary.  This can be the system
+dictionary or a personal version, but in order to support whitelisting, it must be
+editable by the process running Koha.
+
+To enable, enter the absolute path to the ispell dictionary in the system
+preference "TagsExternalDictionary".
+
+Using external Ispell is recommended for both ease of use and performance.  Note that any
+language version of Ispell can be installed.  It is also possible to modify the dictionary 
+at the command line to achieve the desired content.
+
+WARNING: The default Ispell dictionary includes (properly spelled) obscenities!  Users 
+should build their own wordlist and recompile Ispell based on it.  See man ispell for 
+instructions.
+
+=head2 Table Structure
+
+The tables used by tags are:
+       tags_all
+       tags_index
+       tags_approval
+       tags_blacklist
+
+Your first thought may be that this looks a little complicated.  It is, but only because
+it has to be.  I'll try to explain.
+
+tags_all - This table would be all we really need if we didn't care about moderation or
+performance or tags disappearing when borrowers are removed.  Too bad, we do.  Otherwise
+though, it contains all the relevant info about a given tag:
+       tag_id         - unique id number for it
+       borrowernumber - user that entered it
+       biblionumber   - book record it is attached to
+       term           - tag "term" itself
+       language       - perhaps used later to influence weighting
+       date_created   - date and time it was created
+
+tags_approval - Since we need to provide moderation, this table is used to track it.  If no
+external dictionary is used, this table is the sole reference for approval and rejection.
+With an external dictionary, it tracks pending terms and past whitelist/blacklist actions.
+This could be called an "approved terms" table.  See above regarding the External Dictionary.
+       term           - tag "term" itself 
+       approved       - Negative, 0 or positive if tag is rejected, pending or approved.
+       date_approved  - date of last action
+       approved_by    - staffer performing the last action
+       weight_total   - total occurrences of the term in any biblio by any user
+
+tags_index - This table is for performance, because by far the most common operation will 
+be fetching tags for a list of search results.  We will have a set of biblios, and we will
+want ONLY their approved tags and overall weighting.  While we could implement a query that
+would traverse tags_all filtered against tags_approval, the performance implications of
+trying to calculate that and the "weight" (number of times a tag appears) on the fly are drastic.
+       term           - approved term as it appears in tags_approval
+       biblionumber   - book record it is attached to
+       weight         - number of times tag applied by any user
+
+tags_blacklist - A set of regular expression filters.  Unsurprisingly, these should be perl-
+compatible (PCRE) for your version of perl.  Since this is a blacklist, a term will be
+blocked if it matches any of the given patterns.  WARNING: do not add blacklist regexps
+if you do not understand their operation and interaction.  It is quite easy to define too
+simple or too complex a regexp and effectively block all terms.  The blacklist operation is 
+fairly resource intensive, since every line of tags_blacklist will need to be read and compared.
+It is recommended that tags_blacklist be used minimally, and only by an administrator with an
+understanding of regular expression syntax and performance.
+
+So the best way to think about the different tables is that they are each tailored to a certain
+use.  Note that tags_approval and tags_index do not rely on the user's borrower mapping, so
+the tag population can continue to grow even if a user (along with their corresponding
+rows in tags_all) is removed.  
+
  =head2 Tricks
  
  If you want to auto-populate some tags for debugging, do something like this:
@@ -420,8 +692,10 @@ mysql> select biblionumber from biblio where title LIKE "%Health%";
  +--------------+
  26 rows in set (0.00 sec)
  
-Then, take those numbers and type them into this perl command line:
+Then, take those numbers and type/pipe them into this perl command line:
  perl -ne 'use C4::Tags qw(get_tags add_tag); use Data::Dumper;chomp; add_tag($_,"health",51,1); print Dumper get_tags({limit=>5,term=>"health",});'
  
+Note, the borrowernumber in this example is 51.  Use your own or any arbitrary valid borrowernumber.
+
  =cut