4 #   FILE:  SPT--Recommender.php 
    6 #   Part of the Collection Workflow Integration System (CWIS) 
    7 #   Copyright 2004-2013 Edward Almasy and Internet Scout Research Group 
    8 #   http://scout.wisc.edu/cwis/ 
   13     # ---- PUBLIC INTERFACE -------------------------------------------------- 
   14     # define content field types 
   26         # set default parameters 
   27         $this->ContentCorrelationThreshold = 1;
 
   29         # save database object 
   32         # save new configuration values 
   40         # set default debug state 
   44     # set level for debugging output 
   51     # ---- recommendation methods 
   53     # recommend items for specified user 
   54     function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
 
   56         if ($this->
DebugLevel > 0) {  print(
"REC:  Recommend(${UserId}, ${StartingResult}, ${NumberOfResults})<br>\n");  }
 
   58         # load in user ratings 
   61         $DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->RatingFieldName
 
   62                 .
" FROM ".$this->RatingTableName
 
   63                 .
" WHERE ".$this->UserIdFieldName.
" = ${UserId}");
 
   64         while ($Row = 
$DB->FetchRow())
 
   69         if ($this->
DebugLevel > 1) {  print(
"REC:  user has rated ".count($Ratings).
" items<br>\n");  }
 
   71         # for each item that user has rated 
   73         foreach ($Ratings as $ItemId => $ItemRating)
 
   75             # for each content correlation available for that item 
   76             $DB->Query(
"SELECT Correlation, ItemIdB " 
   77                     .
"FROM RecContentCorrelations " 
   78                     .
"WHERE ItemIdA = ${ItemId}");
 
   79             while ($Row = 
$DB->FetchRow())
 
   81                 # multiply that correlation by normalized rating and add 
   82                 #       resulting value to recommendation value for that item 
   83                 if (isset($RecVals[$Row[
"ItemIdB"]]))
 
   85                     $RecVals[$Row[
"ItemIdB"]] +=
 
   86                             $Row[
"Correlation"] * ($ItemRating - 50);
 
   90                     $RecVals[$Row[
"ItemIdB"]] =
 
   91                             $Row[
"Correlation"] * ($ItemRating - 50);
 
   93                 if ($this->
DebugLevel > 9) {  print(
"REC:  RecVal[".$Row[
"ItemIdB"].
"] = ".$RecVals[$Row[
"ItemIdB"]].
"<br>\n");  }
 
   96         if ($this->
DebugLevel > 1) {  print(
"REC:  found ".count($RecVals).
" total recommendations<br>\n");  }
 
   98         # calculate average correlation between items 
   99         $ResultThreshold = 
$DB->Query(
"SELECT AVG(Correlation) " 
  100                 .
"AS Average FROM RecContentCorrelations", 
"Average");
 
  101         $ResultThreshold = round($ResultThreshold) * 2;
 
  103         # for each recommended item 
  104         foreach ($RecVals as $ItemId => $RecVal)
 
  106             # remove item from list if user already rated it 
  107             if (isset($Ratings[$ItemId]))
 
  109                 unset($RecVals[$ItemId]);
 
  113                 # scale recommendation value back to match thresholds 
  114                 $RecVals[$ItemId] = round($RecVal / 50);
 
  116                 # remove item from recommendation list if value is below threshold 
  117                 if ($RecVals[$ItemId] < $ResultThreshold)
 
  119                     unset($RecVals[$ItemId]);
 
  123         if ($this->
DebugLevel > 1) {  print(
"REC:  found ".count($RecVals).
" positive recommendations<br>\n");  }
 
  125         # sort recommendation list by value 
  126         if (isset($RecVals)) {  arsort($RecVals, SORT_NUMERIC);  }
 
  128         # save total number of results available 
  129         $this->NumberOfResultsAvailable = count($RecVals);
 
  131         # trim result list to match range requested by caller 
  132         $RecValKeys = array_slice(
 
  133                 array_keys($RecVals), $StartingResult, $NumberOfResults);
 
  134         $RecValSegment = array();
 
  135         foreach ($RecValKeys as $Key)
 
  137             $RecValSegment[$Key] = $RecVals[$Key];
 
  140         # return recommendation list to caller 
  141         return $RecValSegment;
 
  144     # add function to be called to filter returned recommendation list 
  147         # save filter function name 
  148         $this->FilterFuncs[] = $FunctionName;
 
  151     # return number of recommendations generated 
  157     # return recommendation generation time 
  163     # return list of items used to generate recommendation of specified item 
  166         # pull list of correlations from DB 
  167         $this->DB->Query(
"SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
 
  168                 .
" WHERE (ItemIdA = ${RecommendedItemId}" 
  169                         .
" OR ItemIdB = ${RecommendedItemId})" 
  170                         .
" AND ".$this->UserIdFieldName.
" = ".$UserId
 
  171                         .
" AND (RecContentCorrelations.ItemIdA = ".$this->RatingTableName.
".".$this->ItemIdFieldName
 
  172                         .
" OR RecContentCorrelations.ItemIdB = ".$this->RatingTableName.
".".$this->ItemIdFieldName.
")" 
  173                         .
" AND Rating >= 50 " 
  174                 .
" ORDER BY Correlation DESC");
 
  176         # for each correlation 
  177         $SourceList = array();
 
  178         while ($Row = $this->DB->FetchRow())
 
  180             # pick out appropriate item ID 
  181             if ($Row[
"ItemIdA"] == $RecommendedItemId)
 
  183                 $ItemId = $Row[
"ItemIdB"];
 
  187                 $ItemId = $Row[
"ItemIdA"];
 
  190             # add item to recommendation source list 
  191             $SourceList[$ItemId] = $Row[
"Correlation"];
 
  194         # return recommendation source list to caller 
  198     # dynamically generate and return list of items similar to specified item 
  201         if ($this->
DebugLevel > 1) {  print(
"REC:  searching for items similar to item \"".$ItemId.
"\"<br>\n");  }
 
  203         # make sure we have item IDs available 
  206         # start with empty array 
  207         $SimilarItems = array();
 
  210         foreach ($this->ItemIds as $Id)
 
  212             # if item is not specified item 
  215                 # calculate correlation of item to specified item 
  218                 # if correlation is above threshold 
  219                 if ($Correlation > $this->ContentCorrelationThreshold)
 
  221                     # add item to list of similar items 
  222                     $SimilarItems[$Id] = $Correlation;
 
  226         if ($this->
DebugLevel > 3) {  print(
"REC:  ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" found<br>\n");  }
 
  228         # filter list of similar items (if any) 
  229         if (count($SimilarItems) > 0)
 
  232             if ($this->
DebugLevel > 4) {  print(
"REC:  ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" left after filtering<br>\n");  }
 
  235         # if any similar items left 
  236         if (count($SimilarItems) > 0)
 
  238             # sort list of similar items in order of most to least similar 
  239             arsort($SimilarItems, SORT_NUMERIC);
 
  242         # return list of similar items to caller 
  243         return $SimilarItems;
 
  246     # dynamically generate and return list of recommended field values for item 
  249         if ($this->
DebugLevel > 1) {  print(
"REC:  generating field value recommendations for item \"".$ItemId.
"\"<br>\n");  }
 
  251         # start with empty array of values 
  254         # generate list of similar items 
  257         # if similar items found 
  258         if (count($SimilarItems) > 0)
 
  260             # prune list of similar items to only top third of better-than-average 
  261             $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
 
  262             reset($SimilarItems);
 
  263             $HighestCorr = current($SimilarItems);
 
  264             $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
 
  265             if ($this->
DebugLevel > 8) {  print(
"REC:  <i>Average Correlation: $AverageCorr      Highest Correlation: $HighestCorr      Correlation Threshold: $CorrThreshold </i><br>\n");  }
 
  266             foreach ($SimilarItems as $ItemId => $ItemCorr)
 
  268                 if ($ItemCorr < $CorrThreshold)
 
  270                     unset($SimilarItems[$ItemId]);
 
  273             if ($this->
DebugLevel > 6) {  print(
"REC:  ".count($SimilarItems).
" similar items left after threshold pruning<br>\n");  }
 
  276             foreach ($SimilarItems as $SimItemId => $SimItemCorr)
 
  279                 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
 
  281                     # load field data for this item 
  282                     $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
 
  284                     # if field data is array 
  285                     if (is_array($FieldData))
 
  287                         # for each field data value 
  288                         foreach ($FieldData as $FieldDataVal)
 
  290                             # if data value is not empty 
  291                             $FieldDataVal = trim($FieldDataVal);
 
  292                             if (strlen($FieldDataVal) > 0)
 
  294                                 # increment count for data value 
  295                                 $RecVals[$FieldName][$FieldDataVal]++;
 
  301                         # if data value is not empty 
  302                         $FieldData = trim($FieldData);
 
  303                         if (strlen($FieldData) > 0)
 
  305                             # increment count for data value 
  306                             $RecVals[$FieldName][$FieldData]++;
 
  313             $MatchingCountThreshold = 3;
 
  314             foreach ($RecVals as $FieldName => $FieldVals)
 
  316                 # determine cutoff threshold 
  317                 arsort($FieldVals, SORT_NUMERIC);
 
  319                 $HighestCount = current($FieldVals);
 
  320                 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
 
  321                 $CountThreshold = intval($AverageCount + (($HighestCount - $AverageCount) / 2));
 
  322                 if ($CountThreshold < $MatchingCountThreshold) {  $CountThreshold = $MatchingCountThreshold;  }
 
  323                 if ($this->
DebugLevel > 8) {  print(
"REC:  <i>Field: $FieldName       Average Count: $AverageCount      Highest Count: $HighestCount      Count Threshold: $CountThreshold </i><br>\n");  }
 
  325                 # for each field data value 
  326                 foreach ($FieldVals as $FieldVal => $FieldValCount)
 
  328                     # if value count is below threshold 
  329                     if ($FieldValCount < $CountThreshold)
 
  332                         unset($RecVals[$FieldName][$FieldVal]);
 
  336                 if ($this->
DebugLevel > 3) {  print(
"REC:  found ".count($RecVals[$FieldName]).
" recommended values for field \"".$FieldName.
"\" after threshold pruning<br>\n");  }
 
  340         # return recommended values to caller 
  345     # ---- database update methods 
  349         if ($this->
DebugLevel > 0) {  print(
"REC:  UpdateForItems(${StartingItemId}, ${NumberOfItems})<br>\n");  }
 
  350         # make sure we have item IDs available 
  356         foreach ($this->ItemIds as $ItemId)
 
  358             # if item ID is within requested range 
  359             if ($ItemId >= $StartingItemId)
 
  361                 # update recommender info for item 
  362                 if ($this->
DebugLevel > 1) {  print(
"REC:  doing item ${ItemId}<br>\n");  }
 
  366                 # if we have done requested number of items 
  367                 if ($ItemsUpdated >= $NumberOfItems)
 
  370                     if ($this->
DebugLevel > 1) {  print(
"REC:  bailing out with item ${ItemId}<br>\n");  }
 
  376         # return ID of last resource updated to caller 
  382         if ($this->
DebugLevel > 1) {  print(
"REC:  updating for item \"".$ItemId.
"\"<br>\n");  }
 
  384         # make sure we have item IDs available 
  387         # clear existing correlations for this item 
  388         $this->DB->Query(
"DELETE FROM RecContentCorrelations " 
  389                 .
"WHERE ItemIdA = ${ItemId}");
 
  392         foreach ($this->ItemIds as $Id)
 
  394             # if not doing a full pass, or item is later in list than current item 
  395             if (($FullPass == FALSE) || ($Id > $ItemId))
 
  397                 # update correlation value for item and target item 
  405         # drop all correlation entries referring to item 
  406         $this->DB->Query(
"DELETE FROM RecContentCorrelations " 
  407                          .
"WHERE ItemIdA = ".$ItemId.
" " 
  408                             .
"OR ItemIdB = ".$ItemId);
 
  413         # get average correlation 
  414         $AverageCorrelation = $this->DB->Query(
"SELECT AVG(Correlation) " 
  415                 .
"AS Average FROM RecContentCorrelations", 
"Average");
 
  417         # dump all below-average correlations 
  418         if ($AverageCorrelation > 0)
 
  420             $this->DB->Query(
"DELETE FROM RecContentCorrelations " 
  421                     .
"WHERE Correlation <= ${AverageCorrelation}");
 
  434             $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM " 
  435                     .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
 
  436             $ItemIds = $this->DB->FetchColumn(
"Id");
 
  442     # ---- PRIVATE INTERFACE ------------------------------------------------- 
  461         # if item IDs not already loaded 
  462         if (!isset($this->ItemIds))
 
  464             # load item IDs from DB 
  465             $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM " 
  466                     .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
 
  467             $this->ItemIds = array();
 
  468             while ($Item = $this->DB->FetchRow())
 
  470                 $this->ItemIds[] = $Item[
"Id"];
 
  479         # if data not already loaded 
  480         if (!isset($ItemData[$ItemId][$FieldName]))
 
  482             # load field value from DB 
  483             $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
 
  485             # if field value is array 
  486             if (is_array($FieldValue))
 
  488                 # concatenate together text from array elements 
  489                 $FieldValue = implode(
" ", $FieldValue);
 
  492             # normalize text and break into word array 
  496         # return cached data to caller 
  497         return $ItemData[$ItemId][$FieldName];
 
  500     # calculate content correlation between two items and return value to caller 
  503         static $CorrelationCache;
 
  505         if ($this->
DebugLevel > 10) {  print(
"REC:  calculating correlation" 
  506                 .
" between items $ItemIdA and $ItemIdB<br>\n");  }
 
  508         # order item ID numbers 
  509         if ($ItemIdA > $ItemIdB)
 
  516         # if we already have the correlation 
  517         if (isset($CorrelationCache[$ItemIdA][$ItemIdB]))
 
  519             # retrieve correlation from cache 
  520             $TotalCorrelation = $CorrelationCache[$ItemIdA][$ItemIdB];
 
  524             # if list of fields to correlate specified 
  525             if ($FieldList != NULL)
 
  527                 # create list with only specified fields 
  528                 foreach ($FieldList as $FieldName)
 
  539             # for each content field 
  540             $TotalCorrelation = 0;
 
  543                 # if field is of a type that we use for correlation 
  544                 $FieldType = intval($FieldAttributes[
"FieldType"]);
 
  551                     if ($this->
DebugLevel > 15) {  print(
"REC:  loaded ".count($ItemAData).
" terms for item #".$ItemIdA.
" and ".count($ItemBData).
" terms for item #".$ItemIdB.
" for field \"".$FieldName.
"\"<br>\n");  }
 
  553                     # call appropriate routine to get correlation 
  559                                     $ItemAData, $ItemBData);
 
  563                     # add correlation multiplied by weight to total 
  564                     $TotalCorrelation += $Correlation * $FieldAttributes[
"Weight"];
 
  568             # store correlation to cache 
  569             $CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
 
  572         # return correlation value to caller 
  573         if ($this->
DebugLevel > 9) {  print(
"REC:  correlation between items $ItemIdA and $ItemIdB found to be $TotalCorrelation<br>\n");  }
 
  574         return $TotalCorrelation;
 
  577     # calculate content correlation between two items and update in DB 
  580         if ($this->
DebugLevel > 6) {  print(
"REC:  updating correlation between items $ItemIdA and $ItemIdB<br>\n");  }
 
  582         # bail out if two items are the same 
  583         if ($ItemIdA == $ItemIdB) {  
return;  }
 
  585         # calculate correlation 
  588         # save new correlation 
  662         # strip any HTML tags 
  663         $Text = strip_tags($Text);
 
  665         # strip any punctuation 
  666         $Text = preg_replace(
"/,\\.\\?-\\(\\)\\[\\]\"/", 
" ", $Text);   # 
" 
  668         # normalize whitespace 
  669         $Text = trim(preg_replace("/[\\s]+/
", " ", $Text)); 
  671         # convert to all lower case 
  672         $Text = strtolower($Text); 
  674         # split text into arrays of words 
  675         $Words = explode(" ", $Text); 
  677         # filter out all stop words 
  678         $Words = array_diff($Words, $StopWords); 
  680         # return word array to caller 
  684     function CalcTextCorrelation($WordsA, $WordsB) 
  686         # get array containing intersection of two word arrays 
  687         $IntersectWords = array_intersect($WordsA, $WordsB); 
  689         # return number of words remaining as score 
  690         return count($IntersectWords); 
  693     function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1) 
  695         # if item ID A is greater than item ID B 
  696         if ($ItemIdA > $ItemIdB) 
  704         # if new correlation value provided 
  705         if ($NewCorrelation != -1) 
  707             # if new value is above threshold 
  708             if ($NewCorrelation >= $this->ContentCorrelationThreshold) 
  710                 # insert new correlation value in DB 
  711                 $this->DB->Query("INSERT INTO RecContentCorrelations 
" 
  712                         ."(ItemIdA, ItemIdB, Correlation) 
" 
  713                         ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})
"); 
  715                 # correlation value to return is the new value 
  716                 $Correlation = $NewCorrelation; 
  721                 # return value is zero 
  727             # retrieve correlation value from DB 
  728             $Correlation = $this->DB->Query( 
  729                     "SELECT Correlation FROM RecContentCorrelations 
" 
  730                             ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}
", 
  733             # if no value found in DB 
  734             if ($Correlation == FALSE) 
  736                 # return value is zero 
  741         # return correlation value to caller 
  745     function FilterOnSuppliedFunctions($Results) 
  747         # if filter functions have been set 
  748         if (count($this->FilterFuncs) > 0) 
  751             foreach ($Results as $ResourceId => $Result) 
  753                 # for each filter function 
  754                 foreach ($this->FilterFuncs as $FuncName) 
  756                     # if filter function returns TRUE for result resource 
  757                     if ($FuncName($ResourceId)) 
  760                         if ($this->DebugLevel > 2) {  print("REC:      filter callback rejected resource ${ResourceId}<br>\n
");  } 
  761                         unset($Results[$ResourceId]); 
  763                         # bail out of filter func loop 
  770         # return filtered list to caller 
RecommendFieldValues($ItemId, $FieldList=NULL)
UpdateForItems($StartingItemId, $NumberOfItems)
GetSourceList($UserId, $RecommendedItemId)
ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation=-1)
const CONTENTFIELDTYPE_CONTROLLEDNAME
AddResultFilterFunction($FunctionName)
const CONTENTFIELDTYPE_DATE
FilterOnSuppliedFunctions($Results)
GetItemIds()
Retrieve all item IDs. 
UpdateForItem($ItemId, $FullPass=FALSE)
const CONTENTFIELDTYPE_NUMERIC
GetFieldData($ItemId, $FieldName)
UpdateContentCorrelation($ItemIdA, $ItemIdB)
Recommend($UserId, $StartingResult=0, $NumberOfResults=10)
CalcTextCorrelation($WordsA, $WordsB)
const CONTENTFIELDTYPE_TEXT
Recommender(&$DB, $ItemTableName, $RatingTableName, $ItemIdFieldName, $UserIdFieldName, $RatingFieldName, $ContentFields)
FindSimilarItems($ItemId, $FieldList=NULL)
const CONTENTFIELDTYPE_DATERAMGE
$NumberOfResultsAvailable
NormalizeAndParseText($Text)
CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList=NULL)
$ContentCorrelationThreshold