RSSClient.php
Go to the documentation of this file.00001 <?PHP
00002
00003 #
00004 # FILE: Scout--RSSClient.php
00005 #
00006 # METHODS PROVIDED:
00007 # RSSClient()
00008 # - constructor
00009 # SomeMethod($SomeParameter, $AnotherParameter)
00010 # - short description of method
00011 #
00012 # AUTHOR: Edward Almasy
00013 #
00014 # Copyright 2005 Internet Scout Project
00015 # http://scout.wisc.edu
00016 #
00017
00018 class RSSClient {
00019
00020 # ---- PUBLIC INTERFACE --------------------------------------------------
00021
00022 # object constructor
/**
 * Object constructor.  Retrieves the feed XML (via QueryServerWithCaching())
 * and parses it with a new XMLParser.
 * @param $ServerUrl URL of the RSS feed.
 * @param $CacheDB Database object used for caching feed text, or NULL.
 * @param $RefreshTime Maximum age (in seconds) of a usable cache entry.
 * @param $Encoding Character encoding handed to the XML parser.
 * @param $DebugLevel Debug output level; any non-zero value prints a
 *       summary line, values above 3 also enable parser debugging
 *       (at this level minus 3).
 */
function __construct($ServerUrl, $CacheDB = NULL, $RefreshTime = 600, $Encoding = "UTF-8", $DebugLevel = 0)
{
    # save debug level, encoding, and cache details
    $this->DebugLevel = $DebugLevel;
    $this->Encoding = $Encoding;
    $this->CacheDB = $CacheDB;
    $this->RefreshTime = $RefreshTime;

    # query server (or cache) for XML text
    $this->XmlText = $this->QueryServerWithCaching(
            $ServerUrl, $CacheDB, $RefreshTime);

    # create XML parser and parse text
    $this->Parser = new XMLParser($this->Encoding);
    if ($this->DebugLevel > 3)
    {
        # (must go through $this -- there is no local $Parser here)
        $this->Parser->SetDebugLevel($this->DebugLevel - 3);
    }
    $this->Parser->ParseText($this->XmlText);

    if ($this->DebugLevel) { print("RSSClient->RSSClient() returned ".strlen($this->XmlText)." characters from server query<br>\n"); }
}

/**
 * Old-style (class-named) constructor, retained so that existing code
 * that calls RSSClient() explicitly keeps working.  Delegates to
 * __construct() with the same arguments.
 */
function RSSClient($ServerUrl, $CacheDB = NULL, $RefreshTime = 600, $Encoding = "UTF-8", $DebugLevel = 0)
{
    $this->__construct($ServerUrl, $CacheDB, $RefreshTime, $Encoding, $DebugLevel);
}
00046
00047 # get/set server URL
/**
 * Get or set the RSS server URL.  Supplying a new (different) URL causes
 * the feed to be re-read and re-parsed.
 * @param $NewValue New server URL, or NULL to leave the URL unchanged.
 * @return Current server URL.
 */
function ServerUrl($NewValue = NULL)
{
    # if new RSS server URL supplied
    if (($NewValue != NULL) && ($NewValue != $this->ServerUrl))
    {
        # save new value
        $this->ServerUrl = $NewValue;

        # re-read XML from server at new URL
        $this->XmlText = $this->QueryServerWithCaching(
                $NewValue,
                $this->CacheDB,
                $this->RefreshTime);

        # create new XML parser (using the configured encoding, as the
        #       other parser creation sites do) and parse text
        $this->Parser = new XMLParser($this->Encoding);
        if ($this->DebugLevel > 3)
        {
            # (must go through $this -- there is no local $Parser here)
            $this->Parser->SetDebugLevel($this->DebugLevel - 3);
        }
        $this->Parser->ParseText($this->XmlText);

        # discard channel info loaded from the previous feed so that the
        #       GetChannel*() methods reload it from the new XML
        $this->ChannelTitle = NULL;
        $this->ChannelLink = NULL;
        $this->ChannelDescription = NULL;
    }

    # return RSS server URL to caller
    return $this->ServerUrl;
}
00071
00072 # get/set encoding
/**
 * Get or set the character encoding used when parsing the feed.
 * Supplying a new (different) encoding causes the feed to be re-read
 * and re-parsed with that encoding.
 * @param $NewValue New encoding name, or NULL to leave it unchanged.
 * @return Current encoding.
 */
function Encoding($NewValue = NULL)
{
    # if new encoding supplied
    if (($NewValue != NULL) && ($NewValue != $this->Encoding))
    {
        # save new value
        $this->Encoding = $NewValue;

        # re-read XML from server
        $this->XmlText = $this->QueryServerWithCaching(
                $this->ServerUrl,
                $this->CacheDB,
                $this->RefreshTime);

        # create new XML parser and parse text
        $this->Parser = new XMLParser($this->Encoding);
        if ($this->DebugLevel > 3)
        {
            # (must go through $this -- there is no local $Parser here)
            $this->Parser->SetDebugLevel($this->DebugLevel - 3);
        }
        $this->Parser->ParseText($this->XmlText);

        # discard channel info parsed with the previous encoding so that
        #       the GetChannel*() methods reload it from the new parse
        $this->ChannelTitle = NULL;
        $this->ChannelLink = NULL;
        $this->ChannelDescription = NULL;
    }

    # return encoding to caller
    return $this->Encoding;
}
00096
/**
 * Try to determine the encoding of the feed and switch to it (via
 * Encoding()).  The encoding declared in the XML declaration is used if
 * present, otherwise the charset recorded in the cache entry for the
 * current server URL (if a cache is in use and has one), otherwise
 * ISO-8859-1.
 */
function AutodetectEncoding()
{
    # if neither the XML file nor the HTTP response headers specify an
    # encoding, there is an overwhelming chance that it's ISO-8859-1, so
    # use it as the default
    $Encoding = "ISO-8859-1";

    # only match up to the encoding portion of the XML declaration
    # http://www.w3.org/TR/2006/REC-xml-20060816/#sec-prolog-dtd
    # (S is one or more whitespace characters, Eq is "S? '=' S?", and the
    # dot in the version number is a literal dot; no capturing groups are
    # added here, so the encoding remains at index 3)
    $S = '[ \t\r\n]+';
    $Eq = '[ \t\r\n]*=[ \t\r\n]*';
    $VersionNum = '1\.[0-9]+';
    $EncName = '[A-Za-z]([A-Za-z0-9._]|-)*';
    $VersionInfo = "{$S}version{$Eq}('{$VersionNum}'|\"{$VersionNum}\")";
    $EncodingDecl = "{$S}encoding{$Eq}('{$EncName}'|\"{$EncName}\")";
    $XMLDecl = "<\?xml{$VersionInfo}({$EncodingDecl})?";
    $RegEx = "/{$XMLDecl}/";

    # try to find the encoding, index 3 will be set if encoding is declared
    $Matches = array();
    preg_match($RegEx, (string)$this->XmlText, $Matches);

    # give precedence to the encoding specified within the XML file since
    # a RSS feed publisher might not have access to HTTP response headers
    if (isset($Matches[3]) && strlen($Matches[3]))
    {
        # also need to strip off the quotes
        $Encoding = trim($Matches[3], "'\"");
    }

    # then give precedence to the charset parameter in the Content-Type
    # response header (as saved in the cache when the feed was fetched)
    else if ($this->CacheDB)
    {
        $DB = $this->CacheDB;
        $ServerUrl = addslashes($this->ServerUrl);

        # get the cache entry for the current server URL
        $DB->Query("
                SELECT * FROM RSSClientCache
                WHERE ServerUrl = '".$ServerUrl."'");
        $Exists = ($DB->NumRowsSelected() > 0);
        $Cache = $DB->FetchRow();

        # if cached and charset parameter was given in the response headers
        if ($Exists && strlen($Cache["Charset"]))
        {
            $Encoding = $Cache["Charset"];
        }
    }

    # switch to the detected encoding (re-reads and re-parses the feed
    # if it differs from the current encoding)
    $this->Encoding($Encoding);
}
00155
00156 # retrieve RSS items (from first channel if not otherwise specified)
/**
 * Retrieve items from the feed.
 * @param $NumberOfItems Maximum number of items to return, or NULL for
 *       no limit.  (The comparison against NULL is loose, so 0 also
 *       means no limit, and at least one item is always returned when
 *       the feed contains any.)
 * @param $ChannelName Accepted but not used by this method.
 * @return Array of items, each an array with "title", "description",
 *       "link", and "enclosure" entries (empty array if no items found).
 */
function GetItems($NumberOfItems = NULL, $ChannelName = NULL)
{
    $Found = array();

    # position the parser on the element that contains the items
    # ("channel" inside "rss" if there is one, otherwise "rdf:RDF")
    $Xml = $this->Parser;
    $Xml->SeekToRoot();
    if ($Xml->SeekTo("rss") === NULL)
    {
        $Xml->SeekTo("rdf:RDF");
    }
    else
    {
        $Xml->SeekTo("channel");
    }

    # nothing more to do if there are no items
    if (!$Xml->SeekTo("item"))
    {
        return $Found;
    }

    # collect items until the parser runs out or the limit is reached
    $Unlimited = ($NumberOfItems == NULL);
    do
    {
        $Found[] = array(
                "title" => $Xml->GetData("title"),
                "description" => $Xml->GetData("description"),
                "link" => $Xml->GetData("link"),
                "enclosure" => $Xml->GetAttributes("enclosure"),
                );
    }
    while ($Xml->NextItem() && ($Unlimited || (count($Found) < $NumberOfItems)));

    return $Found;
}
00197
00198 # retrieve site name as given in feed
/**
 * Retrieve the channel title as given in the feed, loading the channel
 * info first if no title is currently set.
 * @return Channel title.
 */
function GetChannelTitle()
{
    if (isset($this->ChannelTitle))
    {
        return $this->ChannelTitle;
    }

    $this->LoadChannelInfo();
    return $this->ChannelTitle;
}
00204
00205 # retrieve site link as given in feed
/**
 * Retrieve the channel link as given in the feed, loading the channel
 * info first if no link is currently set.
 * @return Channel link.
 */
function GetChannelLink()
{
    if (isset($this->ChannelLink))
    {
        return $this->ChannelLink;
    }

    $this->LoadChannelInfo();
    return $this->ChannelLink;
}
00211
00212 # retrieve site description as given in feed
/**
 * Retrieve the channel description as given in the feed, loading the
 * channel info first if no description is currently set.
 * @return Channel description.
 */
function GetChannelDescription()
{
    if (isset($this->ChannelDescription))
    {
        return $this->ChannelDescription;
    }

    $this->LoadChannelInfo();
    return $this->ChannelDescription;
}
00218
00219 # tell caller whether client is using cached data
/**
 * Report the cached-data flag recorded by the most recent query.
 * @return Value of the CachedDataWasUsed property.
 */
function UsedCachedData()
{
    $WasCached = $this->CachedDataWasUsed;
    return $WasCached;
}
00224
00225
00226 # ---- PRIVATE INTERFACE -------------------------------------------------
00227
# cache database object and maximum cache entry age (in seconds)
var $CacheDB, $RefreshTime;
# URL of the RSS feed
var $ServerUrl;
# (not referenced by any method visible in this class)
var $MetadataPrefix, $SetSpec;
# debug output level
var $DebugLevel;
# character encoding handed to the XML parser
var $Encoding;
# raw feed text and the parser loaded with it
var $XmlText, $Parser;
# channel info, filled in by LoadChannelInfo()
var $ChannelTitle, $ChannelLink, $ChannelDescription;
# whether the most recent query was answered from the cache
var $CachedDataWasUsed;
00241
00242 # set current debug output level (0-9)
/**
 * Set the current debug output level.
 * @param $NewLevel New debug level (0-9).
 */
function SetDebugLevel($NewLevel)
{
    $Level = $NewLevel;
    $this->DebugLevel = $Level;
}
00247
/**
 * Fetch the text at a URL along with the media type and charset given
 * in the Content-Type header of the response.  The type and charset
 * are only reported when the Content-Type header of the final response
 * includes a charset parameter.
 * @param $Url URL to fetch.
 * @return Array of (text, type, charset).  Text is FALSE if the fetch
 *       failed; type and charset are NULL when not available.
 */
function GetXmlInfo($Url)
{
    $Text = @file_get_contents($Url);
    $Type = NULL;
    $Charset = NULL;

    # nothing more to report if the fetch failed
    if ($Text === FALSE)
    {
        return array($Text, $Type, $Charset);
    }

    # this must come after file_get_contents() and before any other remote
    # fetching is done (the variable is only populated by the HTTP
    # wrapper, so it will not exist for other kinds of URLs)
    $Headers = isset($http_response_header)
            ? $http_response_header : array();

    # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
    $LWS = '([ \t]*|\r\n[ \t]+)';
    # (hyphen is escaped so that "+-." is not read as a range, which
    # would also admit a comma)
    $Token = '[!\x23-\x27*+\-.\x30-\x39\x41-\x5A\x5E-\x7A|~]+';
    # (literal backslash followed by any ASCII character)
    $QuotedPair = '\\\\[\x00-\x7F]';
    $QdText = "([^\\x00-\\x1F\\x7F\"]|{$LWS})";
    $QuotedString = "\"({$QdText}|{$QuotedPair})*\"";
    $Value = "({$Token}|{$QuotedString})";
    $Parameter = "{$Token}{$LWS}={$LWS}{$Value}";

    # these make the Content-Type regex specific to Content-Type
    # values with charset parameters in them, but make capturing
    # the charset much easier
    $BasicParameter = "(;{$LWS}{$Parameter})*";
    $CharsetParameter = "(;{$LWS}charset{$LWS}={$LWS}{$Value})";
    $ModParameter = "{$BasicParameter}{$CharsetParameter}{$BasicParameter}";
    $MediaType = "({$Token}{$LWS}\\/{$LWS}{$Token}){$LWS}{$ModParameter}";

    # back to the spec
    $ContentType = "Content-Type{$LWS}:{$LWS}{$MediaType}{$LWS}";
    $RegEx = "/^{$ContentType}$/i";

    foreach ($Headers as $Header)
    {
        # a status line starts the headers of a new response (the list
        # includes the headers of any redirects that were followed), so
        # discard whatever an earlier response supplied
        if (preg_match('/^HTTP\//i', $Header))
        {
            $Type = NULL;
            $Charset = NULL;
            continue;
        }

        # capture group 3 is the media type and group 19 the charset value
        $Matches = array();
        if (preg_match($RegEx, $Header, $Matches)
                && isset($Matches[3]) && isset($Matches[19]))
        {
            $Type = $Matches[3];

            # the charset may be given as a quoted string
            $Charset = trim($Matches[19], '"');
        }
    }

    return array($Text, $Type, $Charset);
}
00306
00307 # load RSS XML from server or cache
/**
 * Load the feed XML, from the cache if a cache database is available and
 * holds an unexpired entry for the URL, otherwise from the server.  Text
 * freshly fetched from the server is saved to the cache (when one is in
 * use), replacing any older entry for the URL.
 * @param $ServerUrl URL of the RSS feed.
 * @param $CacheDB Database object used for caching, or NULL to use the
 *       one already saved in this object (if any).
 * @param $RefreshTime Maximum age (in seconds) of a usable cache entry.
 * @return Feed XML text (empty string if the fetch failed).
 */
function QueryServerWithCaching($ServerUrl, $CacheDB, $RefreshTime)
{
    # save RSS server URL
    $this->ServerUrl = $ServerUrl;

    # save caching info (if any)
    if ($CacheDB)
    {
        $this->CacheDB = $CacheDB;
    }

    # start out assuming nothing usable is cached and nothing is retrieved
    $this->CachedDataWasUsed = FALSE;
    $QueryResult = "";

    # if a cache is available
    if ($this->CacheDB)
    {
        $DB = $this->CacheDB;

        # look up cached information for this server
        $QueryTimeCutoff = date("Y-m-d H:i:s", (time() - $RefreshTime));
        $DB->Query("
                SELECT * FROM RSSClientCache
                WHERE ServerUrl = '".addslashes($ServerUrl)."'
                AND LastQueryTime > '".$QueryTimeCutoff."'");

        # if we have cached info that has not expired
        if ($CachedXml = $DB->FetchField("CachedXml"))
        {
            # use cached info
            $this->CachedDataWasUsed = TRUE;
            return $CachedXml;
        }
    }

    # query server for XML text (this must happen whether or not a cache
    # is in use, otherwise a client without a cache never gets any data)
    list($Text, $Type, $Charset) = $this->GetXmlInfo($ServerUrl);

    # if query was successful
    if ($Text !== FALSE)
    {
        $QueryResult = $Text;

        # if a cache is available
        if ($this->CacheDB)
        {
            $DB = $this->CacheDB;

            # clear out any old cache entries
            $DB->Query("
                    DELETE FROM RSSClientCache
                    WHERE ServerUrl = '".addslashes($ServerUrl)."'");

            # save info in cache (type and charset may be NULL, so cast
            # them to strings before escaping)
            $DB->Query("
                    INSERT INTO RSSClientCache
                    (ServerUrl, CachedXml, Type, Charset, LastQueryTime)
                    VALUES (
                    '".addslashes($ServerUrl)."',
                    '".addslashes($Text)."',
                    '".addslashes((string)$Type)."',
                    '".addslashes((string)$Charset)."',
                    NOW())");
        }
    }

    # return query result to caller
    return $QueryResult;
}
00373
/**
 * Read the channel title, link, and description from the parsed feed
 * into the corresponding object properties.
 */
function LoadChannelInfo()
{
    $Xml = $this->Parser;

    # move to the top-level feed element ("rss", or "rdf:RDF" if there is
    # no "rss" element) and then to the channel within it
    $Xml->SeekToRoot();
    if ($Xml->SeekTo("rss") === NULL)
    {
        $Xml->SeekTo("rdf:RDF");
    }
    $Xml->SeekTo("channel");

    # pull out the channel values
    $Title = $Xml->GetData("title");
    $Link = $Xml->GetData("link");
    $Description = $Xml->GetData("description");

    $this->ChannelTitle = $Title;
    $this->ChannelLink = $Link;
    $this->ChannelDescription = $Description;
}
00388 }