CWIS Developer Documentation
Recommender.php
Go to the documentation of this file.
1 <?PHP
2 
3 #
4 # FILE: SPT--Recommender.php
5 #
6 # METHODS PROVIDED:
7 # Recommender()
8 # - constructor
9 # SomeMethod($SomeParameter, $AnotherParameter)
10 # - short description of method
11 #
12 # AUTHOR: Edward Almasy
13 #
14 # Part of the Scout Portal Toolkit
15 # Copyright 2002-2004 Internet Scout Project
16 # http://scout.wisc.edu
17 #
18 
19 class Recommender {
20 
21  # ---- PUBLIC INTERFACE --------------------------------------------------
22  # define content field types
28 
29  # object constructor
33  {
34  # set default parameters
35  $this->ContentCorrelationThreshold = 1;
36 
37  # save database object
38  $this->DB =& $DB;
39 
40  # save new configuration values
41  $this->ItemTableName = $ItemTableName;
42  $this->RatingTableName = $RatingTableName;
43  $this->ItemIdFieldName = $ItemIdFieldName;
44  $this->UserIdFieldName = $UserIdFieldName;
45  $this->RatingFieldName = $RatingFieldName;
46  $this->ContentFields = $ContentFields;
47 
48  # set default debug state
49  $this->DebugLevel = 0;
50  }
51 
52  # set level for debugging output
53  function DebugLevel($Setting)
54  {
55  $this->DebugLevel = $Setting;
56  }
57 
58 
59  # ---- recommendation methods
60 
61  # recommend items for specified user
62  function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
63  {
64  if ($this->DebugLevel > 0) { print("REC: Recommend(${UserId}, ${StartingResult}, ${NumberOfResults})<br>\n"); }
65 
66  # load in user ratings
67  $Ratings = array();
68  $DB =& $this->DB;
69  $DB->Query("SELECT ".$this->ItemIdFieldName.", ".$this->RatingFieldName
70  ." FROM ".$this->RatingTableName
71  ." WHERE ".$this->UserIdFieldName." = ${UserId}");
72  while ($Row = $DB->FetchRow())
73  {
74  $Ratings[$Row[$this->ItemIdFieldName]] =
76  }
77  if ($this->DebugLevel > 1) { print("REC: user has rated ".count($Ratings)." items<br>\n"); }
78 
79  # for each item that user has rated
80  $RecVals = array();
81  foreach ($Ratings as $ItemId => $ItemRating)
82  {
83  # for each content correlation available for that item
84  $DB->Query("SELECT Correlation, ItemIdB "
85  ."FROM RecContentCorrelations "
86  ."WHERE ItemIdA = ${ItemId}");
87  while ($Row = $DB->FetchRow())
88  {
89  # multiply that correlation by normalized rating and add
90  # resulting value to recommendation value for that item
91  if (isset($RecVals[$Row["ItemIdB"]]))
92  {
93  $RecVals[$Row["ItemIdB"]] +=
94  $Row["Correlation"] * ($ItemRating - 50);
95  }
96  else
97  {
98  $RecVals[$Row["ItemIdB"]] =
99  $Row["Correlation"] * ($ItemRating - 50);
100  }
101  if ($this->DebugLevel > 9) { print("REC: RecVal[".$Row["ItemIdB"]."] = ".$RecVals[$Row["ItemIdB"]]."<br>\n"); }
102  }
103  }
104  if ($this->DebugLevel > 1) { print("REC: found ".count($RecVals)." total recommendations<br>\n"); }
105 
106  # calculate average correlation between items
107  $ResultThreshold = $DB->Query("SELECT AVG(Correlation) "
108  ."AS Average FROM RecContentCorrelations", "Average");
109  $ResultThreshold = round($ResultThreshold) * 2;
110 
111  # for each recommended item
112  foreach ($RecVals as $ItemId => $RecVal)
113  {
114  # remove item from list if user already rated it
115  if (isset($Ratings[$ItemId]))
116  {
117  unset($RecVals[$ItemId]);
118  }
119  else
120  {
121  # scale recommendation value back to match thresholds
122  $RecVals[$ItemId] = round($RecVal / 50);
123 
124  # remove item from recommendation list if value is below threshold
125  if ($RecVals[$ItemId] < $ResultThreshold)
126  {
127  unset($RecVals[$ItemId]);
128  }
129  }
130  }
131  if ($this->DebugLevel > 1) { print("REC: found ".count($RecVals)." positive recommendations<br>\n"); }
132 
133  # sort recommendation list by value
134  if (isset($RecVals)) { arsort($RecVals, SORT_NUMERIC); }
135 
136  # save total number of results available
137  $this->NumberOfResultsAvailable = count($RecVals);
138 
139  # trim result list to match range requested by caller
140  $RecValKeys = array_slice(
141  array_keys($RecVals), $StartingResult, $NumberOfResults);
142  $RecValSegment = array();
143  foreach ($RecValKeys as $Key)
144  {
145  $RecValSegment[$Key] = $RecVals[$Key];
146  }
147 
148  # return recommendation list to caller
149  return $RecValSegment;
150  }
151 
152  # add function to be called to filter returned recommendation list
153  function AddResultFilterFunction($FunctionName)
154  {
155  # save filter function name
156  $this->FilterFuncs[] = $FunctionName;
157  }
158 
159  # return number of recommendations generated
160  function NumberOfResults()
161  {
163  }
164 
165  # return recommendation generation time
166  function SearchTime()
167  {
168  return $this->LastSearchTime;
169  }
170 
171  # return list of items used to generate recommendation of specified item
172  function GetSourceList($UserId, $RecommendedItemId)
173  {
174  # pull list of correlations from DB
175  $this->DB->Query("SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
176  ." WHERE (ItemIdA = ${RecommendedItemId}"
177  ." OR ItemIdB = ${RecommendedItemId})"
178  ." AND ".$this->UserIdFieldName." = ".$UserId
179  ." AND (RecContentCorrelations.ItemIdA = ".$this->RatingTableName.".".$this->ItemIdFieldName
180  ." OR RecContentCorrelations.ItemIdB = ".$this->RatingTableName.".".$this->ItemIdFieldName.")"
181  ." AND Rating >= 50 "
182  ." ORDER BY Correlation DESC");
183 
184  # for each correlation
185  $SourceList = array();
186  while ($Row = $this->DB->FetchRow())
187  {
188  # pick out appropriate item ID
189  if ($Row["ItemIdA"] == $RecommendedItemId)
190  {
191  $ItemId = $Row["ItemIdB"];
192  }
193  else
194  {
195  $ItemId = $Row["ItemIdA"];
196  }
197 
198  # add item to recommendation source list
199  $SourceList[$ItemId] = $Row["Correlation"];
200  }
201 
202  # return recommendation source list to caller
203  return $SourceList;
204  }
205 
206  # dynamically generate and return list of items similar to specified item
207  function FindSimilarItems($ItemId, $FieldList = NULL)
208  {
209  if ($this->DebugLevel > 1) { print("REC: searching for items similar to item \"".$ItemId."\"<br>\n"); }
210 
211  # make sure we have item IDs available
212  $this->LoadItemIds();
213 
214  # start with empty array
215  $SimilarItems = array();
216 
217  # for every item
218  foreach ($this->ItemIds as $Id)
219  {
220  # if item is not specified item
221  if ($Id != $ItemId)
222  {
223  # calculate correlation of item to specified item
224  $Correlation = $this->CalculateContentCorrelation($ItemId, $Id, $FieldList);
225 
226  # if correlation is above threshold
227  if ($Correlation > $this->ContentCorrelationThreshold)
228  {
229  # add item to list of similar items
230  $SimilarItems[$Id] = $Correlation;
231  }
232  }
233  }
234  if ($this->DebugLevel > 3) { print("REC: ".count($SimilarItems)." similar items to item \"".$ItemId."\" found<br>\n"); }
235 
236  # filter list of similar items (if any)
237  if (count($SimilarItems) > 0)
238  {
239  $SimilarItems = $this->FilterOnSuppliedFunctions($SimilarItems);
240  if ($this->DebugLevel > 4) { print("REC: ".count($SimilarItems)." similar items to item \"".$ItemId."\" left after filtering<br>\n"); }
241  }
242 
243  # if any similar items left
244  if (count($SimilarItems) > 0)
245  {
246  # sort list of similar items in order of most to least similar
247  arsort($SimilarItems, SORT_NUMERIC);
248  }
249 
250  # return list of similar items to caller
251  return $SimilarItems;
252  }
253 
254  # dynamically generate and return list of recommended field values for item
255  function RecommendFieldValues($ItemId, $FieldList = NULL)
256  {
257  if ($this->DebugLevel > 1) { print("REC: generating field value recommendations for item \"".$ItemId."\"<br>\n"); }
258 
259  # start with empty array of values
260  $RecVals = array();
261 
262  # generate list of similar items
263  $SimilarItems = $this->FindSimilarItems($ItemId, $FieldList);
264 
265  # if similar items found
266  if (count($SimilarItems) > 0)
267  {
268  # prune list of similar items to only top third of better-than-average
269  $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
270  reset($SimilarItems);
271  $HighestCorr = current($SimilarItems);
272  $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
273  if ($this->DebugLevel > 8) { print("REC: <i>Average Correlation: $AverageCorr &nbsp;&nbsp;&nbsp;&nbsp; Highest Correlation: $HighestCorr &nbsp;&nbsp;&nbsp;&nbsp; Correlation Threshold: $CorrThreshold </i><br>\n"); }
274  foreach ($SimilarItems as $ItemId => $ItemCorr)
275  {
276  if ($ItemCorr < $CorrThreshold)
277  {
278  unset($SimilarItems[$ItemId]);
279  }
280  }
281  if ($this->DebugLevel > 6) { print("REC: ".count($SimilarItems)." similar items left after threshold pruning<br>\n"); }
282 
283  # for each item
284  foreach ($SimilarItems as $SimItemId => $SimItemCorr)
285  {
286  # for each field
287  foreach ($this->ContentFields as $FieldName => $FieldAttributes)
288  {
289  # load field data for this item
290  $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
291 
292  # if field data is array
293  if (is_array($FieldData))
294  {
295  # for each field data value
296  foreach ($FieldData as $FieldDataVal)
297  {
298  # if data value is not empty
299  $FieldDataVal = trim($FieldDataVal);
300  if (strlen($FieldDataVal) > 0)
301  {
302  # increment count for data value
303  $RecVals[$FieldName][$FieldDataVal]++;
304  }
305  }
306  }
307  else
308  {
309  # if data value is not empty
310  $FieldData = trim($FieldData);
311  if (strlen($FieldData) > 0)
312  {
313  # increment count for data value
314  $RecVals[$FieldName][$FieldData]++;
315  }
316  }
317  }
318  }
319 
320  # for each field
321  $MatchingCountThreshold = 3;
322  foreach ($RecVals as $FieldName => $FieldVals)
323  {
324  # determine cutoff threshold
325  arsort($FieldVals, SORT_NUMERIC);
326  reset($FieldVals);
327  $HighestCount = current($FieldVals);
328  $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
329  $CountThreshold = intval($AverageCount + (($HighestCount - $AverageCount) / 2));
330  if ($CountThreshold < $MatchingCountThreshold) { $CountThreshold = $MatchingCountThreshold; }
331  if ($this->DebugLevel > 8) { print("REC: <i>Field: $FieldName &nbsp;&nbsp;&nbsp;&nbsp; Average Count: $AverageCount &nbsp;&nbsp;&nbsp;&nbsp; Highest Count: $HighestCount &nbsp;&nbsp;&nbsp;&nbsp; Count Threshold: $CountThreshold </i><br>\n"); }
332 
333  # for each field data value
334  foreach ($FieldVals as $FieldVal => $FieldValCount)
335  {
336  # if value count is below threshold
337  if ($FieldValCount < $CountThreshold)
338  {
339  # unset value
340  unset($RecVals[$FieldName][$FieldVal]);
341  }
342  }
343 
344  if ($this->DebugLevel > 3) { print("REC: found ".count($RecVals[$FieldName])." recommended values for field \"".$FieldName."\" after threshold pruning<br>\n"); }
345  }
346  }
347 
348  # return recommended values to caller
349  return $RecVals;
350  }
351 
352 
353  # ---- database update methods
354 
355  function UpdateForItems($StartingItemId, $NumberOfItems)
356  {
357  if ($this->DebugLevel > 0) { print("REC: UpdateForItems(${StartingItemId}, ${NumberOfItems})<br>\n"); }
358  # make sure we have item IDs available
359  $this->LoadItemIds();
360 
361  # for every item
362  $ItemsUpdated = 0;
363  $ItemId = NULL;
364  foreach ($this->ItemIds as $ItemId)
365  {
366  # if item ID is within requested range
367  if ($ItemId >= $StartingItemId)
368  {
369  # update recommender info for item
370  if ($this->DebugLevel > 1) { print("REC: doing item ${ItemId}<br>\n"); }
371  $this->UpdateForItem($ItemId, TRUE);
372  $ItemsUpdated++;
373 
374  # if we have done requested number of items
375  if ($ItemsUpdated >= $NumberOfItems)
376  {
377  # bail out
378  if ($this->DebugLevel > 1) { print("REC: bailing out with item ${ItemId}<br>\n"); }
379  return $ItemId;
380  }
381  }
382  }
383 
384  # return ID of last resource updated to caller
385  return $ItemId;
386  }
387 
388  function UpdateForItem($ItemId, $FullPass = FALSE)
389  {
390  if ($this->DebugLevel > 1) { print("REC: updating for item \"".$ItemId."\"<br>\n"); }
391 
392  # make sure we have item IDs available
393  $this->LoadItemIds();
394 
395  # clear existing correlations for this item
396  $this->DB->Query("DELETE FROM RecContentCorrelations "
397  ."WHERE ItemIdA = ${ItemId}");
398 
399  # for every item
400  foreach ($this->ItemIds as $Id)
401  {
402  # if full pass and item is later in list than current item
403  if (($FullPass == FALSE) || ($Id > $ItemId))
404  {
405  # update correlation value for item and target item
406  $this->UpdateContentCorrelation($ItemId, $Id);
407  }
408  }
409  }
410 
411  function DropItem($ItemId)
412  {
413  # drop all correlation entries referring to item
414  $this->DB->Query("DELETE FROM RecContentCorrelations "
415  ."WHERE ItemIdA = ".$ItemId." "
416  ."OR ItemIdB = ".$ItemId);
417  }
418 
419  function PruneCorrelations()
420  {
421  # get average correlation
422  $AverageCorrelation = $this->DB->Query("SELECT AVG(Correlation) "
423  ."AS Average FROM RecContentCorrelations", "Average");
424 
425  # dump all below-average correlations
426  if ($AverageCorrelation > 0)
427  {
428  $this->DB->Query("DELETE FROM RecContentCorrelations "
429  ."WHERE Correlation <= ${AverageCorrelation}");
430  }
431  }
432 
437  function GetItemIds()
438  {
439  static $ItemIds;
440  if (!isset($ItemIds))
441  {
442  $this->DB->Query("SELECT ".$this->ItemIdFieldName." AS Id FROM "
443  .$this->ItemTableName." ORDER BY ".$this->ItemIdFieldName);
444  $ItemIds = $this->DB->FetchColumn("Id");
445  }
446  return $ItemIds;
447  }
448 
449 
450  # ---- PRIVATE INTERFACE -------------------------------------------------
451 
459  var $ItemIds;
460  var $DB;
465 
466 
467  function LoadItemIds()
468  {
469  # if item IDs not already loaded
470  if (!isset($this->ItemIds))
471  {
472  # load item IDs from DB
473  $this->DB->Query("SELECT ".$this->ItemIdFieldName." AS Id FROM "
474  .$this->ItemTableName." ORDER BY ".$this->ItemIdFieldName);
475  $this->ItemIds = array();
476  while ($Item = $this->DB->FetchRow())
477  {
478  $this->ItemIds[] = $Item["Id"];
479  }
480  }
481  }
482 
483  function GetFieldData($ItemId, $FieldName)
484  {
485  static $ItemData;
486  static $CachedItemList;
487 
488  # if data not already loaded
489  if (!isset($ItemData[$ItemId][$FieldName]))
490  {
491  # load field value from DB
492  $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
493 
494  # if field value is array
495  if (is_array($FieldValue))
496  {
497  # concatenate together text from array elements
498  $FieldValue = implode(" ", $FieldValue);
499  }
500 
501  # normalize text and break into word array
502  $ItemData[$ItemId][$FieldName] = $this->NormalizeAndParseText($FieldValue);
503 
504  # if more items than cache limit
505  if (count($ItemData) > 1000)
506  {
507  # dump oldest item
508  reset($ItemData);
509  list($DumpedItemId, $DumpedItemData) = each($ItemData);
510  unset($ItemData[$DumpedItemId]);
511  }
512  }
513 
514  # return cached data to caller
515  return $ItemData[$ItemId][$FieldName];
516  }
517 
518  # calculate content correlation between two items and return value to caller
519  function CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList = NULL)
520  {
521  static $CorrelationCache;
522 
523  if ($this->DebugLevel > 10) { print("REC: calculating correlation between items $ItemIdA and $ItemIdB<br>\n"); }
524 
525  # order item ID numbers
526  if ($ItemIdA > $ItemIdB)
527  {
528  $Temp = $ItemIdA;
529  $ItemIdA = $ItemIdB;
530  $ItemIdB = $Temp;
531  }
532 
533  # if we already have the correlation
534  if (isset($CorrelationCache[$ItemIdA][$ItemIdB]))
535  {
536  # retrieve correlation from cache
537  $TotalCorrelation = $CorrelationCache[$ItemIdA][$ItemIdB];
538  }
539  else
540  {
541  # if list of fields to correlate specified
542  if ($FieldList != NULL)
543  {
544  # create list with only specified fields
545  foreach ($FieldList as $FieldName)
546  {
547  $ContentFields[$FieldName] = $this->ContentFields[$FieldName];
548  }
549  }
550  else
551  {
552  # use all fields
554  }
555 
556  # for each content field
557  $TotalCorrelation = 0;
558  foreach ($ContentFields as $FieldName => $FieldAttributes)
559  {
560  # if field is of a type that we use for correlation
561  $FieldType = intval($FieldAttributes["FieldType"]);
562  if (($FieldType == Recommender::CONTENTFIELDTYPE_TEXT)
564  {
565  # load data
566  $ItemAData = $this->GetFieldData($ItemIdA, $FieldName);
567  $ItemBData = $this->GetFieldData($ItemIdB, $FieldName);
568  if ($this->DebugLevel > 15) { print("REC: loaded ".count($ItemAData)." terms for item #".$ItemIdA." and ".count($ItemBData)." terms for item #".$ItemIdB." for field \"".$FieldName."\"<br>\n"); }
569 
570  # call appropriate routine to get correlation
571  switch ($FieldType)
572  {
575  $Correlation = $this->CalcTextCorrelation(
576  $ItemAData, $ItemBData);
577  break;
578  }
579 
580  # add correlation multiplied by weight to total
581  $TotalCorrelation += $Correlation * $FieldAttributes["Weight"];
582  }
583  }
584 
585  # store correlation to cache
586  $CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
587  }
588 
589  # return correlation value to caller
590  if ($this->DebugLevel > 9) { print("REC: correlation between items $ItemIdA and $ItemIdB found to be $TotalCorrelation<br>\n"); }
591  return $TotalCorrelation;
592  }
593 
594  # calculate content correlation between two items and update in DB
595  function UpdateContentCorrelation($ItemIdA, $ItemIdB)
596  {
597  if ($this->DebugLevel > 6) { print("REC: updating correlation between items $ItemIdA and $ItemIdB<br>\n"); }
598 
599  # bail out if two items are the same
600  if ($ItemIdA == $ItemIdB) { return; }
601 
602  # calculate correlation
603  $Correlation = $this->CalculateContentCorrelation($ItemIdA, $ItemIdB);
604 
605  # save new correlation
606  $this->ContentCorrelation($ItemIdA, $ItemIdB, $Correlation);
607  }
608 
609  function NormalizeAndParseText($Text)
610  {
611  $StopWords = array(
612  "a",
613  "about",
614  "also",
615  "an",
616  "and",
617  "are",
618  "as",
619  "at",
620  "be",
621  "but",
622  "by",
623  "can",
624  "each",
625  "either",
626  "for",
627  "from",
628  "has",
629  "he",
630  "her",
631  "here",
632  "hers",
633  "him",
634  "his",
635  "how",
636  "i",
637  "if",
638  "in",
639  "include",
640  "into",
641  "is",
642  "it",
643  "its",
644  "me",
645  "neither",
646  "no",
647  "nor",
648  "not",
649  "of",
650  "on",
651  "or",
652  "so",
653  "she",
654  "than",
655  "that",
656  "the",
657  "their",
658  "them",
659  "then",
660  "there",
661  "these",
662  "they",
663  "this",
664  "those",
665  "through",
666  "to",
667  "too",
668  "very",
669  "what",
670  "when",
671  "where",
672  "while",
673  "who",
674  "why",
675  "will",
676  "you",
677  "");
678 
679  # strip any HTML tags
680  $Text = strip_tags($Text);
681 
682  # strip any punctuation
683  $Text = preg_replace("/,\\.\\?-\\(\\)\\[\\]\"/", " ", $Text); # "
684 
685  # normalize whitespace
686  $Text = trim(preg_replace("/[\\s]+/", " ", $Text));
687 
688  # convert to all lower case
689  $Text = strtolower($Text);
690 
691  # split text into arrays of words
692  $Words = explode(" ", $Text);
693 
694  # filter out all stop words
695  $Words = array_diff($Words, $StopWords);
696 
697  # return word array to caller
698  return $Words;
699  }
700 
701  function CalcTextCorrelation($WordsA, $WordsB)
702  {
703  # get array containing intersection of two word arrays
704  $IntersectWords = array_intersect($WordsA, $WordsB);
705 
706  # return number of words remaining as score
707  return count($IntersectWords);
708  }
709 
710  function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1)
711  {
712  # if item ID A is greater than item ID B
713  if ($ItemIdA > $ItemIdB)
714  {
715  # swap item IDs
716  $Temp = $ItemIdA;
717  $ItemIdA = $ItemIdB;
718  $ItemIdB = $Temp;
719  }
720 
721  # if new correlation value provided
722  if ($NewCorrelation != -1)
723  {
724  # if new value is above threshold
725  if ($NewCorrelation >= $this->ContentCorrelationThreshold)
726  {
727  # insert new correlation value in DB
728  $this->DB->Query("INSERT INTO RecContentCorrelations "
729  ."(ItemIdA, ItemIdB, Correlation) "
730  ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})");
731 
732  # return correlation value is new value
733  $Correlation = $NewCorrelation;
734  }
735  # else
736  else
737  {
738  # return value is zero
739  $Correlation = 0;
740  }
741  }
742  else
743  {
744  # retrieve correlation value from DB
745  $Correlation = $this->DB->Query(
746  "SELECT Correlation FROM RecContentCorrelations "
747  ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}",
748  "Correlation");
749 
750  # if no value found in DB
751  if ($Correlation == FALSE)
752  {
753  # return value is zero
754  $Correlation = 0;
755  }
756  }
757 
758  # return correlation value to caller
759  return $Correlation;
760  }
761 
762  function FilterOnSuppliedFunctions($Results)
763  {
764  # if filter functions have been set
765  if (count($this->FilterFuncs) > 0)
766  {
767  # for each result
768  foreach ($Results as $ResourceId => $Result)
769  {
770  # for each filter function
771  foreach ($this->FilterFuncs as $FuncName)
772  {
773  # if filter function return TRUE for result resource
774  if ($FuncName($ResourceId))
775  {
776  # discard result
777  if ($this->DebugLevel > 2) { print("REC: filter callback rejected resource ${ResourceId}<br>\n"); }
778  unset($Results[$ResourceId]);
779 
780  # bail out of filter func loop
781  continue 2;
782  }
783  }
784  }
785  }
786 
787  # return filtered list to caller
788  return $Results;
789  }
790 }
791 
792 ?>