00001 <?PHP
00002
00003 #
00004 # FILE: Scout--OAIClient.php
00005 # Provides a client for pulling data from OAI-PMH providers
00006 # For protocol documentation, see:
00007 # http://www.openarchives.org/OAI/openarchivesprotocol.html
00008 #
00009 # METHODS PROVIDED:
00010 # OAIClient(ServerUrl, Cache)
00011 # - constructor
00012 # ServerUrl(NewValue)
00013 # - Change the base url of the remote repository
00014 # MetadataPrefix($pfx)
00015 # - Set the schema we will request from remote
00016 # SetSpec($set)
00017 # - Restrict queries to a single set
00018 # for details, see
00019 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
00020 # GetIdentification()
00021 # - Fetch identifying information about the remote repository
00022 # GetFormats()
00023 # - Fetch information about what schemas remote can serve
00024 # GetRecords($start,$end)
00025 # - Pull records in batches, optionally with date restrictions
00026 # GetRecord($id)
00027 # - Pull a single record using a unique identifier
00028 # MoreRecordsAvailable()
00029 # - Determine if a batch pull is complete or not
00030 # ResetRecordPointer()
00031 # - Restart a batch pull from the beginning
00032 # SetDebugLevel()
00033 # - Determine verbosity
00034 #
00035 # Copyright 2008 Edward Almasy and Internet Scout
00036 # http://scout.wisc.edu
00037 #
00038
00039 require_once("Scout--XMLParser.php");
00040
00041
class OAIClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Class constructor.
    * @param ServerUrl Base URL of the remote OAI-PMH repository.
    * @param Cache Directory used to cache ListRecords responses.  If the
    *       directory already exists when the client is constructed,
    *       GetRecords() replays the responses stored in it instead of
    *       querying the server.  If it does not exist it is created and
    *       every response fetched by GetRecords() is written into it.  If
    *       it cannot be created, caching is disabled.  (OPTIONAL)
    */
    public function __construct($ServerUrl, $Cache = NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries
        $this->SetSpec = NULL;

        # start with no batch pull in progress and caching disabled
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
        $this->Cache = NULL;
        $this->UsingCache = FALSE;

        if (($Cache !== NULL) && ($Cache !== ""))
        {
            if (is_dir($Cache))
            {
                # existing cache directory: replay its contents
                $this->Cache = $Cache;
                $this->UsingCache = TRUE;
            }
            elseif (mkdir($Cache, 0777, TRUE))
            {
                # fresh cache directory: fill it as responses arrive
                $this->Cache = $Cache;
            }
            # (otherwise the directory could not be created, so we fall
            #  back to uncached operation rather than failing on every
            #  later cache write)
        }
    }

    /**
    * PHP 4-style constructor name, retained only so that code calling it
    * explicitly (e.g. parent::OAIClient(...)) keeps working.  PHP 8 no
    * longer treats a method named after its class as the constructor,
    * so object set-up lives in __construct().
    * @param ServerUrl Base URL of the remote OAI-PMH repository.
    * @param Cache Cache directory, as for __construct().  (OPTIONAL)
    * @deprecated Use "new OAIClient(...)" instead.
    */
    public function OAIClient($ServerUrl, $Cache = NULL)
    {
        $this->__construct($ServerUrl, $Cache);
    }

    /**
    * Get or set the base URL of the remote repository.
    * @param NewValue New base URL.  Values that compare loosely equal to
    *       NULL (NULL, empty string) leave the URL unchanged.  (OPTIONAL)
    * @return Current base URL.
    */
    public function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
    * Get or set the metadata schema requested from the remote repository.
    * @param NewValue New metadata prefix.  Values that compare loosely
    *       equal to NULL leave the prefix unchanged.  (OPTIONAL)
    * @return Current metadata prefix ("oai_dc" unless changed).
    */
    public function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
    * Get or set the set specification that restricts record queries.
    * A sentinel default is used (rather than NULL) so that callers can
    * pass NULL to clear the restriction.
    * @param NewValue New set specification, or NULL to clear.  (OPTIONAL)
    * @return Current set specification (NULL if none).
    */
    public function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        # strict comparison, so that values such as 0 are not mistaken
        #       for the "no argument supplied" sentinel
        if ($NewValue !== "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
    * Fetch identifying information about the remote repository, using
    * the OAI-PMH "Identify" verb.
    * @return Array with whichever of the keys "Name", "Email", and "URL"
    *       were present in the response (empty array if the query failed
    *       or the response could not be parsed).
    */
    public function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object (nothing to parse if query failed)
        $Xml = ($XmlText !== "") ? simplexml_load_string($XmlText) : FALSE;
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (($Xml !== FALSE) && isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
    * Fetch the list of metadata formats the remote repository can serve,
    * using the OAI-PMH "ListMetadataFormats" verb.
    * @return Array with one entry per format, each an array with
    *       whichever of the keys "Name", "Schema", and "Namespace" were
    *       present (empty array if the query failed or the response
    *       could not be parsed).
    */
    public function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object (nothing to parse if query failed)
        $Xml = ($XmlText !== "") ? simplexml_load_string($XmlText) : FALSE;
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (($Xml !== FALSE)
                && isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info for each format
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                # (start from an array so that a format with none of the
                #       expected children still yields an array entry)
                $Entry = array();
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $Entry);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $Entry);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace", $Entry);
                $Formats[] = $Entry;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
    * Retrieve the next batch of records, using the OAI-PMH "ListRecords"
    * verb.  The first call starts a new pull; while
    * MoreRecordsAvailable() returns TRUE, further calls continue it via
    * the resumption token (the date arguments are then ignored).  When a
    * pre-existing cache directory is in use, responses are read from the
    * cache instead of the server.
    * @param StartDate Only return records from this date on.  (OPTIONAL)
    * @param EndDate Only return records up to this date.  (OPTIONAL)
    * @return Array of records, each an array with "identifier" and
    *       "datestamp" entries plus, when present in the response,
    *       "format", "metadata", and "about" entries.
    */
    public function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        # work out which cache file (if any) belongs to this batch
        $CacheFileName = NULL;
        if ($this->Cache !== NULL)
        {
            $CacheFileName = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
            $this->CacheSequenceNumber++;
        }

        if (($CacheFileName !== NULL) && $this->UsingCache)
        {
            # get XML text from the cache (a missing or unreadable file
            #       is treated as an empty response)
            $XmlText = is_readable($CacheFileName)
                    ? file_get_contents($CacheFileName) : FALSE;
            if ($XmlText === FALSE)
            {
                $XmlText = "";
            }
        }
        else
        {
            # if we have resumption token from prior query
            if (isset($this->ResumptionToken))
            {
                # use resumption token as sole argument
                $Args = array("resumptionToken" => $this->ResumptionToken);
            }
            else
            {
                # set up arguments for query
                $Args = array("metadataPrefix" => $this->MetadataPrefix);
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate) {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            # save response if we are filling a cache
            if ($CacheFileName !== NULL)
            {
                file_put_contents($CacheFileName, $XmlText);
            }
        }

        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "listrecords");
    }

    /**
    * Retrieve a single record by its unique identifier, using the
    * OAI-PMH "GetRecord" verb.  This does not disturb a batch pull that
    * is in progress via GetRecords().
    * @param Id Unique identifier of the record.
    * @return Array of records in the same format as returned by
    *       GetRecords().
    */
    public function GetRecord($Id)
    {
        $Args = array(
                "metadataPrefix" => $this->MetadataPrefix,
                "identifier" => $Id);

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "getrecord");
    }

    /**
    * Determine whether the current batch pull has more records to fetch.
    * @return TRUE if the last GetRecords() response carried a resumption
    *       token, otherwise FALSE.
    */
    public function MoreRecordsAvailable()
    {
        return isset($this->ResumptionToken);
    }

    /**
    * Restart the batch pull from the beginning (forgets the resumption
    * token and rewinds the cache file sequence).
    */
    public function ResetRecordPointer()
    {
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
    }

    /**
    * Set the verbosity of debugging output.
    * @param NewLevel New debug level.  Within this class, raw response
    *       text is printed at level 8 and above, and parsed structures
    *       at level 9 and above.
    */
    public function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken;
    private $Cache;
    private $UsingCache;
    private $CacheSequenceNumber;

    # perform OAI query and return resulting data to caller
    #       (returns empty string if the server could not be reached)
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # assemble query URL
        $QueryUrl = $this->ServerUrl."?verb=".$QueryVerb;
        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");
        if ($FHndl === FALSE)
        {
            # nothing to read and nothing to close
            return "";
        }

        # read entire response (stream_get_contents() stops on error as
        #       well as on end of stream, unlike a feof()/fread() loop)
        $Text = stream_get_contents($FHndl);

        # close OAI server stream
        fclose($FHndl);

        # return query result data to caller
        return ($Text === FALSE) ? "" : $Text;
    }

    # set array value if available in simplexml object
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    # print variable contents if debug is above specified level
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."() ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    # append data for the parser's current tag and each following sibling
    #       tag to $Target, grouped by tag name (parser must already be
    #       positioned on the first sibling)
    private function CollectSiblingData($Parser, &$Target)
    {
        $TagName = $Parser->GetTagName();
        do
        {
            $Target[$TagName][] = $Parser->GetData();
        } while ($TagName = $Parser->NextTag());
    }

    # Query has been sent, we need to retrieve records that came from it.
    #       $ParseTo is the (lower case) name of the response element that
    #       contains the records ("listrecords" or "getrecord").
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # create XML parser and pass it text
        $Parser = new XMLParser();
        $Parser->ParseText($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Parser", $Parser);

        # if records were found
        $Records = array();
        $ItemCount = $Parser->SeekTo("oai-pmh", $ParseTo, "record");
        if ($ItemCount)
        {
            # for each record
            do
            {
                # grab record identifier and date
                $Record = array(
                        "identifier" =>
                                $Parser->GetData("header", "identifier"),
                        "datestamp" =>
                                $Parser->GetData("header", "datestamp"));

                # grab metadata
                if ($Parser->SeekTo("metadata"))
                {
                    if ($Parser->SeekToChild())
                    {
                        # first child of <metadata> names the format
                        $Record["format"] = $Parser->GetTagName();
                        if ($Parser->SeekToChild())
                        {
                            $this->CollectSiblingData(
                                    $Parser, $Record["metadata"]);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                # grab search info (if any)
                if ($Parser->SeekTo("about"))
                {
                    if ($Parser->SeekTo("searchInfo"))
                    {
                        if ($Parser->SeekToChild())
                        {
                            $this->CollectSiblingData(
                                    $Parser, $Record["about"]["SEARCHINFO"]);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                $Records[] = $Record;
            }
            while ($Parser->NextItem());
        }

        # resumption tokens only belong to list responses, so leave any
        #       batch pull in progress alone when parsing a single-record
        #       (GetRecord) response
        if ($ParseTo === "listrecords")
        {
            # look for resumption token
            # NOTE(review): SeekTo() is assumed to return NULL when the
            #       path is not found, as the original comparison implied
            #       -- confirm against XMLParser
            $Parser->SeekToRoot();
            $SeekResult = $Parser->SeekTo(
                    "oai-pmh", "listrecords", "resumptiontoken");
            $Token = ($SeekResult !== NULL) ? $Parser->GetData() : NULL;

            # save token if found (OAI-PMH marks the end of a list with
            #       an EMPTY resumptionToken element, so an empty token
            #       means the pull is complete, not that more is waiting)
            $HasToken = is_scalar($Token) && (trim((string)$Token) !== "");
            $this->ResumptionToken = $HasToken ? $Token : NULL;
        }

        # return records to caller
        return $Records;
    }

}
00458
00459 ?>