CWIS Developer Documentation
OAIClient.php
Go to the documentation of this file.
1 <?PHP
2 #
3 # FILE: OAIClient.php
4 # Provides a client for pulling data from OAI-PMH providers
5 # For protocol documentation, see:
6 # http://www.openarchives.org/OAI/openarchivesprotocol.html
7 #
8 # METHODS PROVIDED:
9 # OAIClient(ServerUrl, Cache)
10 # - constructor
11 # ServerUrl(NewValue)
12 # - Change the base url of the remote repository
13 # MetadataPrefix($pfx)
14 # - Set the schema we will request from remote
15 # SetSpec($set)
16 # - Restrict queries to a single set
17 # for details, see
18 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
19 # GetIdentification()
20 # - Fetch identifying information about the remote repository
21 # GetFormats()
22 # - Fetch information about what schemas remote can serve
23 # GetRecords($start,$end)
24 # - Pull records in batches, optionally with date restrictions
25 # GetRecord($id)
26 # - Pull a single record using a unique identifier
27 # MoreRecordsAvailable()
28 # - Determine if a batch pull is complete or not
29 # ResetRecordPointer()
30 # - Restart a batch pull from the beginning
31 # SetDebugLevel()
32 # - Determine verbosity
33 #
34 # Copyright 2014 Edward Almasy and Internet Scout
35 # http://scout.wisc.edu
36 #
37 
class OAIClient
{

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Class constructor.
    * @param string $ServerUrl Base URL of the target OAI repository server.
    * @param string $Cache Name of a directory in which fetched ListRecords
    *       responses are stored and from which they are re-read on later
    *       runs.  The directory is created (recursively) if it does not
    *       exist.  (OPTIONAL, default NULL for no caching)
    */
    public function __construct($ServerUrl, $Cache=NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries
        $this->SetSpec = NULL;

        # no batch pull is in progress yet
        $this->ResumptionToken = NULL;

        # set up response cache (if requested)
        $this->Cache = NULL;
        $this->CacheSequenceNumber = 0;
        if ($Cache !== NULL)
        {
            $this->Cache = $Cache;
            if (!is_dir($Cache))
            {
                # create intermediate directories as well, so that a nested
                #       cache path does not silently fail to be created
                mkdir($Cache, 0777, TRUE);
            }
        }
    }

    /**
    * Get or set URL of target OAI repository server.
    * @param string $NewValue New base URL.  (OPTIONAL)  Values that compare
    *       loosely equal to NULL (e.g. an empty string) leave the current
    *       URL unchanged.
    * @return string Current base URL.
    */
    public function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
    * Get or set metadata schema for records being retrieved.
    * @param string $NewValue New metadata prefix.  (OPTIONAL)  Values that
    *       compare loosely equal to NULL leave the current prefix unchanged.
    * @return string Current metadata prefix.
    */
    public function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
    * Get or set specification of subset of records to be retrieved.
    * @param string $NewValue New set specification, or NULL to stop
    *       restricting queries to a set.  (OPTIONAL)
    * @return string|null Current set specification.
    */
    public function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        # strict comparison against the sentinel, so that values such as
        #       TRUE or 0 are not mistaken for "no argument supplied"
        if ($NewValue !== "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
    * Retrieve identification information from repository server.
    * @return array Values found in the Identify response, with the
    *       indexes "Name" (repositoryName), "Email" (adminEmail), and
    *       "URL" (baseURL).  Indexes for values that were not present are
    *       omitted, so the array is empty if the query failed.
    */
    public function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (($Xml !== FALSE) && isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
    * Retrieve list of available metadata formats from repository server.
    * @return array One entry per metadataFormat element in the response,
    *       each an array with the indexes "Name" (metadataPrefix),
    *       "Schema" (schema), and "Namespace" (metadataNamespace), where
    *       present.
    */
    public function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = simplexml_load_string($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (($Xml !== FALSE)
                && isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info for each format
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                # build each entry locally so that a format with none of
                #       the expected children yields an empty array
                #       rather than a NULL entry
                $FormatInfo = array();
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace",
                        $FormatInfo);
                $Formats[] = $FormatInfo;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
    * Retrieve records from repository server.  Each call retrieves one
    * batch; when MoreRecordsAvailable() reports TRUE afterwards, calling
    * this method again continues with the next batch (the date arguments
    * are then ignored, because the resumption token is sent as the sole
    * argument).  When a cache directory is in use, batches are read from
    * and written to sequentially numbered files in that directory.
    * @param string $StartDate Start of date range to request ("from").
    *       (OPTIONAL)
    * @param string $EndDate End of date range to request ("until").
    *       (OPTIONAL)
    * @return array|null Records in the format produced by
    *       GetRecordsFromXML(), or NULL if the response could not be parsed.
    */
    public function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        # if we're using a cache directory, figure out which file
        #       should contain this set of records
        $UsingCache = ($this->Cache !== NULL);
        $CacheFileName = NULL;
        if ($UsingCache)
        {
            $CacheFileName = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
        }

        if ($UsingCache && file_exists($CacheFileName))
        {
            # get XML text from the cache
            $XmlText = file_get_contents($CacheFileName);
            if ($XmlText === FALSE)
            {
                $XmlText = "";
            }
            $this->CacheSequenceNumber++;
        }
        else
        {
            # when we're not using a cache or don't have a cached copy of
            #       this set of records, query the OAI provider to get it
            $Args = array();
            if (isset($this->ResumptionToken))
            {
                # use resumption token from prior query as sole argument
                $Args["resumptionToken"] = $this->ResumptionToken;
            }
            else
            {
                # set up arguments for query
                $Args["metadataPrefix"] = $this->MetadataPrefix;
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate) {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            # if a cache is in use, save this chunk of XML into it
            #       (a failed query yields an empty string, which must not
            #       be cached or consume a cache slot, because it would
            #       be replayed as a permanent failure on later runs)
            if ($UsingCache && ($XmlText !== ""))
            {
                file_put_contents($CacheFileName, $XmlText);
                $this->CacheSequenceNumber++;
            }
        }

        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "ListRecords");
    }

    /**
    * Get a single record from a repository server.  The cache is not
    * used, and the state of any GetRecords() batch pull in progress is
    * left untouched.
    * @param string $Id Unique identifier of the record to retrieve.
    * @return array|null Records in the format produced by
    *       GetRecordsFromXML() (at most one entry), or NULL if the
    *       response could not be parsed.
    */
    public function GetRecord($Id)
    {
        $Args = array(
                "metadataPrefix" => $this->MetadataPrefix,
                "identifier" => $Id,
                );

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__, "XmlText", htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "GetRecord");
    }

    /**
    * Check whether more records are available after last GetRecords().
    * @return bool TRUE if the last ListRecords response carried a
    *       non-empty resumption token, otherwise FALSE.
    */
    public function MoreRecordsAvailable()
    {
        return isset($this->ResumptionToken);
    }

    /**
    * Clear any additional records available after last GetRecords(), so
    * that the next GetRecords() call starts again from the first batch
    * (and from the first cache file, when a cache is in use).
    */
    public function ResetRecordPointer()
    {
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
    }

    /**
    * Set current debug output level.
    * @param int $NewLevel New level.  Raw response text is printed at
    *       level 8 and above, parsed XML objects at level 9 and above.
    */
    public function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken;
    private $Cache;
    private $CacheSequenceNumber;

    /**
    * Perform an OAI query over HTTP and return the raw response.
    * @param string $QueryVerb OAI-PMH verb to send (e.g. "ListRecords").
    * @param array $Args Additional query arguments, with argument names
    *       for the index.  (OPTIONAL)
    * @return string Response text, or an empty string if the server could
    *       not be reached or the response could not be read.
    */
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # append to any query string already present in the server URL
        $Separator = (strpos($this->ServerUrl, "?") === FALSE) ? "?" : "&";
        $QueryUrl = $this->ServerUrl.$Separator."verb=".$QueryVerb;

        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");

        # if stream could not be opened there is nothing to read or close
        if ($FHndl === FALSE)
        {
            return "";
        }

        # read entire response, then close OAI server stream
        $Text = stream_get_contents($FHndl);
        fclose($FHndl);

        # return query result data to caller
        return ($Text === FALSE) ? "" : $Text;
    }

    /**
    * Copy the (trimmed) text of a child element into a results array, if
    * that child element is present.
    * @param SimpleXMLElement $Xml Element to look in.
    * @param string $SrcName Name of child element to look for.
    * @param string $DstName Index to store the value under.
    * @param array $Results Array to add the value to (passed by reference).
    */
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    /**
    * Print a variable for debugging, if current debug level is high enough.
    * @param int $Level Minimum debug level at which to print.
    * @param string $MethodName Name of calling method.
    * @param string $VarName Name of variable being printed.
    * @param mixed $VarValue Value to print (via print_r()).
    */
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."() ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    // @codingStandardsIgnoreStart
    /*
    * Pull records out of an XML DOMNode.
    *
    * Data converted from XML will be added to
    * $Records[$Index][$Section], with the XML from the DOM node
    * flattened.  Text and CDATA children of each element are glued
    * together to form its value.  For example, if we were to call
    * ExtractDataFromXml($Records, 0, $dom, "metadata") with $dom
    * pointing to XML like this and $Records initially empty:
    *
    * @code
    * <record xmlns="http://ns.nsdl.org/ncs/lar"
    *   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    *   xsi:schemaLocation="http://ns.nsdl.org/ncs/lar http://ns.nsdl.org/ncs/lar/1.00/schemas/lar.xsd">
    *   <recordID>2200/20121012134026795T</recordID>
    *   <recordDate>2012-07-24</recordDate>
    *   <identifier>http://chemteacher.chemeddl.org/services/chemteacher/index.php?option=com_content&amp;view=article&amp;id=77</identifier>
    *   <title>ChemTeacher: Periodic Table Resource Pak</title>
    *   <license>
    *     <name URL="http://creativecommons.org/licenses/by-sa/3.0/">Creative commons:Attribution share alike (by-sa)</name>
    *     <property>Attribution required</property>
    *     <property>Educational use only</property>
    *     <property>Share alike required</property>
    *   </license>
    * </record>
    * @endcode
    *
    * After the call, print_r($Records) would produce something like:
    * @code
    * Array
    * (
    *     [0] => Array
    *         (
    *             [metadata] => Array
    *                 (
    *                     [recordID] => Array ( [0] => 2200/20121012134026795T )
    *                     [recordDate] => Array ( [0] => 2012-07-24 )
    *                     [identifier] => Array
    *                         (
    *                             [0] => http://chemteacher.chemeddl.org/services/chemteacher/index.php?option=com_content&view=article&id=77
    *                         )
    *                     [title] => Array ( [0] => ChemTeacher: Periodic Table Resource Pak )
    *                     [license/name] => Array ( [0] => Creative commons:Attribution share alike (by-sa) )
    *                     [license/property] => Array
    *                         (
    *                             [0] => Attribution required
    *                             [1] => Educational use only
    *                             [2] => Share alike required
    *                         )
    *                 )
    *         )
    * )
    * @endcode
    *
    * @param array $Records to place data in.
    * @param int $Index record number to populate
    * @param DOMNode $dom to extract data from
    * @param string $Section section of the record to populate (e.g.,
    *      metadata, about)
    * @param string $ParentTagName parent tag or null for the root of
    *      this record, should only be non-null when called recursively
    *      (OPTIONAL, default NULL)
    */
    private function ExtractDataFromXml(&$Records, $Index, DOMNode $dom,
            $Section, $ParentTagName=NULL)
    {
        foreach ($dom->childNodes as $node)
        {
            # only DOM children that are elements are of interest (not
            #       comments, text, or something else)
            if ($node->nodeType != XML_ELEMENT_NODE)
            {
                continue;
            }

            # compute a tag name to use
            $StorageTagName =
                    (($ParentTagName !== NULL) ? $ParentTagName."/" : "")
                    .$node->nodeName;

            # glue together the contents of the character data children of
            #       this node (CDATA sections are separate node types from
            #       plain text, and would otherwise be silently dropped)
            $Value = "";
            foreach ($node->childNodes as $child)
            {
                if (($child->nodeType == XML_TEXT_NODE)
                        || ($child->nodeType == XML_CDATA_SECTION_NODE))
                {
                    $Value .= $child->nodeValue;
                }
            }

            # if we had a non-empty value, add it to the results
            if (strlen(trim($Value)) > 0)
            {
                $Records[$Index][$Section][$StorageTagName][] = $Value;
            }

            # and process our children
            $this->ExtractDataFromXml($Records, $Index,
                    $node, $Section, $StorageTagName);
        }
    }
    // @codingStandardsIgnoreEnd

    /**
    * Get the first child of a DOM node that is an element.
    * @param DOMNode $dom Node to search.
    * @return DOMNode|null First element child, or NULL if there is none.
    */
    private function GetFirstElement(DOMNode $dom)
    {
        foreach ($dom->childNodes as $child)
        {
            if ($child->nodeType == XML_ELEMENT_NODE)
            {
                return $child;
            }
        }

        return NULL;
    }

    /**
    * Parse an OAI response and extract the records it contains.  When
    * parsing a ListRecords response, the resumption token for the batch
    * pull is also saved (or cleared, if the response carried none).
    * @param string $XmlText Response text to parse.
    * @param string $ParseTo Name of the element (directly under the
    *       response root) that contains the records ("ListRecords" or
    *       "GetRecord").
    * @return array|null NULL if the text could not be parsed.  Otherwise
    *       an array with one entry per record, each with the indexes
    *       "identifier" and "datestamp" (from the record header), plus
    *       "format" (name of the first element inside the record
    *       metadata) and "metadata" where the record has metadata, and
    *       "about" where the record has an about section.  The
    *       "metadata" and "about" values are in the flattened format
    *       produced by ExtractDataFromXml().
    */
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # if there is no text to parse, return NULL
        if (!is_string($XmlText) || ($XmlText === ""))
        {
            return NULL;
        }

        # create XML parser and pass it text
        $Xml = simplexml_load_string($XmlText);

        # if text could not be parsed, return NULL
        if (!($Xml instanceof SimpleXMLElement))
        {
            return NULL;
        }

        # set up vars to hold our results
        $Records = array();
        $Index = 0;

        # we'll want to find our records with XPath, so we need to
        #       register a prefix for the oai elements
        $Xml->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");

        # extract records (treating an XPath failure as "no records")
        $RecordXML = $Xml->xpath("oai:".$ParseTo."//oai:record");
        if (!is_array($RecordXML))
        {
            $RecordXML = array();
        }

        foreach ($RecordXML as $Record)
        {
            # pull relevant information out of the header
            #
            # Note that SimpleXMLElement objects map elements onto PHP
            # object properties, and will return a SimpleXMLElement w/o
            # any associated XML for non-existent elements.  So,
            # nothing explodes when we ask the Record for an element it
            # did not contain.
            #
            # However, SimpleXMLElements w/o associated XML return
            # 'NULL' for all properties.  Therefore, if we tried to
            # look at the grandchild of a non-existent element it would
            # be problematic.  In the cases below, we get empty
            # strings when the children of 'header' &c are empty, which
            # is what we want anyway.

            $Records[$Index]["identifier"] = (string)$Record->header->identifier;
            $Records[$Index]["datestamp"] = (string)$Record->header->datestamp;

            # grab associated metadata (if there is any)
            if ($Record->metadata->count() > 0)
            {
                # to avoid frustrations with namespaces and SimpleXML, use
                #       DOMDocument to parse the record data
                $MetadataNode = dom_import_simplexml($Record->metadata);

                # get the element that holds the record data (skipping
                #       this record's metadata if it holds no element)
                $DataNode = $this->GetFirstElement($MetadataNode);
                if ($DataNode !== NULL)
                {
                    # record the format used for this record
                    $Records[$Index]["format"] = $DataNode->nodeName;

                    # extract data for this record
                    $this->ExtractDataFromXml(
                            $Records, $Index, $DataNode, "metadata");
                }
            }

            # if there is additional information available, snag that too
            if ($Record->about->count() > 0)
            {
                $AboutNode = dom_import_simplexml($Record->about);
                $this->ExtractDataFromXml($Records, $Index, $AboutNode, "about");
            }

            # move along to the next record
            $Index++;
        }

        # look for resumption token and save if found -- only for
        #       ListRecords responses, so that fetching a single record
        #       with GetRecord() does not wipe out the state of a batch
        #       pull that is in progress
        if ($ParseTo == "ListRecords")
        {
            $Token = isset($Xml->ListRecords->resumptionToken)
                    ? (string)$Xml->ListRecords->resumptionToken : "";
            $this->ResumptionToken = (strlen($Token) > 0) ? $Token : NULL;
        }

        # return records to caller
        return $Records;
    }
}
ResetRecordPointer()
Clear any additional records available after last GetRecords().
Definition: OAIClient.php:283
ServerUrl($NewValue=NULL)
Get or set URL of target OAI repository server.
Definition: OAIClient.php:79
GetRecord($Id)
Get a single record from a repositry server.
Definition: OAIClient.php:258
__construct($ServerUrl, $Cache=NULL)
Class constructor.
Definition: OAIClient.php:49
MoreRecordsAvailable()
Check whether more records are available after last GetRecords().
Definition: OAIClient.php:275
GetRecords($StartDate=NULL, $EndDate=NULL)
Retrieve records from repository server.
Definition: OAIClient.php:193
MetadataPrefix($NewValue=NULL)
Get or set metadata schema for records being retrieved.
Definition: OAIClient.php:93
SetSpec($NewValue="X-NOSETSPECVALUE-X")
Get or set specification of subset of records to be retrieved.
Definition: OAIClient.php:107
GetIdentification()
Retrieve identification information from repository server.
Definition: OAIClient.php:123
SetDebugLevel($NewLevel)
Set current debug output level.
Definition: OAIClient.php:294
GetFormats()
Retrieve list of available metadata formats from repository server.
Definition: OAIClient.php:153