00001 <?php
/**
 * Porter stemming algorithm for English words.
 *
 * All methods are static; the only public entry point is
 * {@see PorterStemmer::Stem()}. The implementation works on lower-case
 * ASCII input: the suffix tables and the vowel/consonant patterns below only
 * contain lower-case letters, and no case folding is performed here.
 */
class PorterStemmer
{
    /**
     * Regex fragment matching ONE consonant.
     *
     * A "y" counts as a consonant when it follows a vowel or is the first
     * character of the string being examined.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching ONE vowel.
     *
     * A "y" counts as a vowel when it is not preceded by a vowel.
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Memoised results, keyed by the word exactly as it was passed to Stem().
     *
     * @var array
     */
    private static $cache = array();

    /**
     * Stems a word.
     *
     * Words of one or two characters are returned unchanged. A trailing
     * "'ve", "n't" or "'d" is stripped before the Porter steps are applied.
     *
     * @param string $word  Word to stem.
     * @param bool   $cache Whether to read from / write to the in-memory
     *                      result cache.
     *
     * @return string The stemmed word.
     */
    public static function Stem($word, $cache = false)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        // isset() rather than !empty(): a cached stem such as '' or '0' is
        // still a valid hit.
        if ($cache && isset(self::$cache[$word])) {
            return self::$cache[$word];
        }

        // Remember the caller's spelling: $word is rewritten below, and the
        // cache must be written under the same key it is looked up with.
        $cacheKey = $word;

        // Drop common contraction endings before stemming.
        $word = preg_replace("/('ve|n't|'d)$/", '', $word);

        $stem = self::step1ab($word);
        $stem = self::step1c($stem);
        $stem = self::step2($stem);
        $stem = self::step3($stem);
        $stem = self::step4($stem);
        $stem = self::step5($stem);

        if ($cache) {
            self::$cache[$cacheKey] = $stem;
        }

        return $stem;
    }

    /**
     * Step 1a and 1b: plurals and the -eed / -ed / -ing endings.
     *
     * @param string $word
     *
     * @return string
     */
    private static function step1ab($word)
    {
        // Step 1a: plural endings. The first matching suffix wins; the
        // 'ss' => 'ss' entry exists only to stop a bare 's' being stripped.
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
                || self::replace($word, 'ies', 'i')
                || self::replace($word, 'ss', 'ss')
                || self::replace($word, 's', '');
        }

        // Step 1b: a word ending in 'eed' is handled by replace() alone
        // (which reports a match even when the measure test fails), so the
        // -ed / -ing handling is skipped for it.
        if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
            $v = self::$regex_vowel;

            // 'ing' / 'ed' are removed only if the remaining stem has a vowel.
            if ((preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                || (preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', ''))
            ) {
                // Tidy up the stem left after removing the suffix.
                if (!self::replace($word, 'at', 'ate')
                    && !self::replace($word, 'bl', 'ble')
                    && !self::replace($word, 'iz', 'ize')
                ) {
                    $tail = substr($word, -2);

                    if (self::doubleConsonant($word)
                        && $tail !== 'll'
                        && $tail !== 'ss'
                        && $tail !== 'zz'
                    ) {
                        // Undouble the final consonant (e.g. "hopp" => "hop").
                        $word = substr($word, 0, -1);
                    } elseif (self::m($word) === 1 && self::cvc($word)) {
                        // Short stem: restore the 'e' (e.g. "hop" => "hope").
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turns a final 'y' into 'i' when the rest of the word
     * contains a vowel.
     *
     * @param string $word
     *
     * @return string
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps double suffixes to single ones (requires m > 0).
     *
     * The switch is on the penultimate character so only the suffixes that
     * can possibly match are tried.
     *
     * @param string $word
     *
     * @return string
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ational', 'ate', 0)
                    || self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                self::replace($word, 'enci', 'ence', 0)
                    || self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                self::replace($word, 'entli', 'ent', 0)
                    || self::replace($word, 'ousli', 'ous', 0)
                    || self::replace($word, 'alli', 'al', 0)
                    || self::replace($word, 'bli', 'ble', 0)
                    || self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                self::replace($word, 'ization', 'ize', 0)
                    || self::replace($word, 'ation', 'ate', 0)
                    || self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                self::replace($word, 'iveness', 'ive', 0)
                    || self::replace($word, 'fulness', 'ful', 0)
                    || self::replace($word, 'ousness', 'ous', 0)
                    || self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                self::replace($word, 'biliti', 'ble', 0)
                    || self::replace($word, 'aliti', 'al', 0)
                    || self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: handles -ic-, -full, -ness and similar endings (requires m > 0).
     *
     * @param string $word
     *
     * @return string
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'alise', 'al', 0)
                    || self::replace($word, 'ness', '', 0);
                break;

            case 't':
                self::replace($word, 'icate', 'ic', 0)
                    || self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: strips -ant, -ence and similar endings (requires m > 1).
     *
     * @param string $word
     *
     * @return string
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                self::replace($word, 'ance', '', 1)
                    || self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                self::replace($word, 'able', '', 1)
                    || self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                self::replace($word, 'ant', '', 1)
                    || self::replace($word, 'ement', '', 1)
                    || self::replace($word, 'ment', '', 1)
                    || self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // 'ion' is only removed when it follows 's' or 't'.
                $ending = substr($word, -4);
                if ($ending === 'tion' || $ending === 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                self::replace($word, 'ate', '', 1)
                    || self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: removes a final 'e' and reduces a final 'll' on long stems.
     *
     * @param string $word
     *
     * @return string
     */
    private static function step5($word)
    {
        // Step 5a: drop a trailing 'e' when m > 1, or when m == 1 and the
        // stem does not end consonant-vowel-consonant.
        if (substr($word, -1) === 'e') {
            $stem    = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1) {
                self::replace($word, 'e', '');
            } elseif ($measure === 1 && !self::cvc($stem)) {
                self::replace($word, 'e', '');
            }
        }

        // Step 5b: 'll' => 'l' when m > 1.
        if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the suffix $check of $str with $repl.
     *
     * When $m is given, the replacement is performed only if the measure of
     * the part before the suffix is greater than $m. Note that the return
     * value reports whether the suffix MATCHED, not whether the string was
     * changed; callers rely on this to stop trying further suffixes.
     *
     * @param string   $str   Word to modify (by reference).
     * @param string   $check Suffix to look for.
     * @param string   $repl  Replacement for the suffix.
     * @param int|null $m     Minimum measure (exclusive) of the remaining
     *                        stem, or null for no measure test.
     *
     * @return bool True if $str ends with $check, false otherwise.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) === $check) {
            $substr = substr($str, 0, $len);
            if ($m === null || self::m($substr) > $m) {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * Computes the Porter "measure" of a string: the number of
     * vowel-run/consonant-run pairs (VC) it contains once any leading
     * consonants and trailing vowels have been removed.
     *
     * @param string $str
     *
     * @return int
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        $str = preg_replace("#^$c+#", '', $str);
        $str = preg_replace("#$v+$#", '', $str);

        preg_match_all("#($v+$c+)#", $str, $matches);

        return count($matches[1]);
    }

    /**
     * Tells whether the string ends with the same consonant twice.
     *
     * @param string $str
     *
     * @return bool
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the curly-brace form ($s{0}) no longer
        // parses on PHP 8.
        return preg_match('#' . $c . '{2}$#', $str, $matches)
            && $matches[0][0] === $matches[0][1];
    }

    /**
     * Tells whether the string ends consonant-vowel-consonant, where the
     * final consonant is not 'w', 'x' or 'y'.
     *
     * @param string $str
     *
     * @return bool
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        return preg_match("#($c$v$c)$#", $str, $matches)
            && strlen($matches[1]) === 3
            && $matches[1][2] !== 'w'
            && $matches[1][2] !== 'x'
            && $matches[1][2] !== 'y';
    }
}
00439 ?>