Current File : //opt/RZphp73/includes/XML/NITF.php |
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
/**
* NITF XML Parser
*
* PHP versions 4 and 5
*
* LICENSE: This source file is subject to version 3.0 of the PHP license
* that is available through the world-wide-web at the following URI:
* http://www.php.net/license/3_0.txt. If you did not receive a copy of
* the PHP License and are unable to obtain it through the web, please
* send a note to license@php.net so we can mail you a copy immediately.
*
* @category XML
* @package XML_NITF
* @author Patrick O'Lone <polone@townnews.com>
* @copyright 1997-2005 The PHP Group
* @license http://www.php.net/license/3_0.txt PHP License 3.0
* @version CVS: $Id: NITF.php 304667 2010-10-24 01:49:35Z clockwerx $
* @link http://pear.php.net/package/XML_NITF/
*/
/**
* Include the XML_Parser class as the base class
*/
require_once ('XML/Parser.php');
// {{{ XML_NITF
/**
* Simple NITF Parser
*
* This class provides basic NITF parsing. Many of the major elements of the NITF
* standard are supported. This implementation is based off the NITF 3.1 DTD,
* publicly available at the following URL:
*
* http://www.nitf.org/site/nitf-documentation/nitf-3-1.dtd
*
* Note that not all elements of this standard are supported.
*
* <sample>
* <?php
*
* require_once("XML/NITF.php");
*
* $oNITF = new XML_NITF();
* $oNITF->setInputFile("nitf.xml");
* $xResult = $oNITF->parse();
* if (PEAR::isError($xResult)) {
* die("Parsing failed: ".$xResult->getMessage());
* }
*
* echo $oNITF->getHeadline();
* echo $oNITF->getByline();
*
* ?>
* </sample>
*
* @category XML
* @package XML_NITF
* @author Patrick O'Lone <polone@townnews.com>
* @copyright 1997-2005 The PHP Group
* @license http://www.php.net/license/3_0.txt PHP License 3.0
* @version Release: 1.0.2
* @link http://pear.php.net/package/XML_NITF
*/
class XML_NITF extends XML_Parser
{
    // {{{ properties

    /**
     * Meta tag properties retrieved from the document head section
     *
     * Keyed by the lower-cased NAME attribute of each <meta> element; the
     * value is that element's CONTENT attribute.
     *
     * @see getMetaData()
     * @var array
     * @access private
     */
    var $m_kMeta = array();

    /**
     * Document Metadata
     *
     * Container for metadata information about this particular document.
     *
     * @see getDocData()
     * @var array
     * @access private
     */
    var $m_kDocData = array ('key-list' => array ());

    /**
     * Specific Publication Data
     *
     * Information about a specific instance of an item's publication. Holds
     * the attributes of the <pubdata> element, keyed by lower-cased attribute
     * name.
     *
     * @see getPubData()
     * @var array
     * @access private
     */
    var $m_kPubData = array ();

    /**
     * Document Revisions
     *
     * Information about the creative history of the document; also used as an
     * audit trail. Each element of the array is a key-based array holding the
     * (lower-cased) attributes of one <revision-history> element.
     *
     * @var array
     * @see getRevision()
     * @access private
     */
    var $m_akRevisions = array ();

    /**
     * Document Headlines
     *
     * The headlines found in the document, keyed by level. 'HL1' holds the
     * main headline as a string (text found in <hedline> outside of an <hl2>
     * is treated as HL1); 'HL2' holds the list of sub-headlines.
     *
     * @var array
     * @see getHeadline()
     * @access private
     */
    var $m_kHedlines = array ('HL1' => null, 'HL2' => array ());

    /**
     * Abstract
     *
     * Story abstract summary or synopsis of the contents of the document.
     * NOTE(review): nothing in this class assigns this property.
     *
     * @var string
     * @access private
     */
    var $m_sAbstract = null;

    /**
     * Significant place mentioned in an article. Collected from the text of
     * <location> elements found inside a <dateline> in the body head.
     *
     * @var string
     * @access private
     */
    var $m_sLocation = null;

    /**
     * Information distributor. May or may not be the owner or creator.
     *
     * @var string
     * @access private
     */
    var $m_sDistributor = null;

    /**
     * The elements of the byline: 'author' (all byline text outside <byttl>)
     * and 'title' (the text of <byttl>).
     *
     * @var array
     * @see getByline()
     * @access private
     */
    var $m_kByline = array ('author' => null, 'title' => null);

    /**
     * An array of paragraphs extracted from the document body content
     *
     * @var array
     * @see getLede(), getContent()
     * @access private
     */
    var $m_aContent = array ();

    /**
     * A list of media reference elements as found in the body section of the
     * document. Each element is an array itself with keyed properties related
     * to the media element in question.
     *
     * @var array
     * @see getMedia()
     * @access private
     */
    var $m_aMedia = array ();

    /**
     * Stack of the element names that are currently open, outermost first.
     * The handlers consult it to decide where character data belongs.
     *
     * @var array
     * @see StartHandler(), EndHandler(), cdataHandler()
     * @access private
     */
    var $m_aParentTags = array ();

    /**
     * A byline at the end of a story. Example: Stuart Myles contributed to this
     * article.
     *
     * @var string
     * @see getTagline()
     * @access private
     */
    var $m_sTagline = null;

    /**
     * Free-form bibliographic data. Used to elaborate on the source of
     * information.
     *
     * @var string
     * @see getBibliography()
     * @access private
     */
    var $m_sBibliography = null;

    /**
     * Text of the <hl2> element currently being parsed (null between
     * sub-headlines). Declared explicitly so the parser does not rely on
     * dynamically created properties.
     *
     * @var string
     * @access private
     */
    var $_sHedline = null;

    /**
     * Text of the <p> element currently being parsed (null between
     * paragraphs).
     *
     * @var string
     * @access private
     */
    var $_sContent = null;

    /**
     * Whether the <p> element currently being parsed carried lede="true".
     *
     * @var boolean
     * @access private
     */
    var $_bIsLede = false;

    /**
     * The media entry currently being assembled (null outside of <media>).
     *
     * @var array
     * @access private
     */
    var $_kMedia = null;

    // }}}
    // {{{ getDocData()

    /**
     * Access all or specific elements of the <docdata> block
     *
     * @param string $sProperty The property of the <docdata> block to return
     *                          (matched case-insensitively). Properties this
     *                          parser populates:
     *                          +"doc-id" - the id-string of <doc-id> (string)
     *                          +"key-list" - the keywords provided with the
     *                                        document (array)
     *                          +"copyright" - the copyright holder (string)
     *                          +"urgency" - the ed-urg value of <urgency>
     *                                       (string as found in the document)
     *                          +"date.issue", "date.release", "date.expire" -
     *                            the norm attribute of the matching date
     *                            element, kept as the string found in the
     *                            document (use strtotime() to convert it)
     *                          +"management-status" - this document's current
     *                                                 workflow status
     *
     * @return mixed All of the elements from the <docdata> block are returned
     *               if no property is given. If a property is requested and
     *               is set, its value is returned; otherwise null.
     *
     * @access public
     */
    function getDocData($sProperty = null)
    {
        if (empty ($sProperty)) {
            return $this->m_kDocData;
        }

        $sKey = strtolower($sProperty);

        return isset ($this->m_kDocData[$sKey]) ? $this->m_kDocData[$sKey] : null;
    }

    // }}}
    // {{{ getMetaData()

    /**
     * Retrieve meta data from the NITF file
     *
     * @return array Returns an array of key/value pairs from the meta section
     * @access public
     */
    function getMetaData()
    {
        return $this->m_kMeta;
    }

    // }}}
    // {{{ getPubData()

    /**
     * Returns all elements or a specific element from the <pubdata> block
     *
     * @param string $sProperty The publication property being retrieved
     *                          (matched case-insensitively)
     * @return mixed The whole <pubdata> attribute array if no property is
     *               given; otherwise the requested attribute value, or null
     *               when it is not set.
     *
     * @access public
     */
    function getPubData($sProperty = null)
    {
        if (empty ($sProperty)) {
            return $this->m_kPubData;
        }

        $sKey = strtolower($sProperty);

        return isset ($this->m_kPubData[$sKey]) ? $this->m_kPubData[$sKey] : null;
    }

    // }}}
    // {{{ getRevision()

    /**
     * Get the revision history
     *
     * @return array An array containing one key-value array per
     *               <revision-history> element, holding that element's
     *               attributes with lower-cased names. Attributes commonly
     *               present are:
     *
     *               +"comment" - Reason for the revision
     *               +"function" - Job function of individual performing revision
     *               +"name" - Name of the person who made the revision
     *               +"norm" - Date of the revision
     * @access public
     */
    function getRevision()
    {
        return $this->m_akRevisions;
    }

    // }}}
    // {{{ getHeadline()

    /**
     * Retrieve the headline(s) stored for a given level
     *
     * @param integer $nLevel The headline level to retrieve (1 = main
     *                        headline, 2 = sub-headlines)
     * @return mixed The main headline string for level 1, the array of
     *               sub-headlines for level 2, or null when nothing is stored
     *               for the requested level (previously an unknown level
     *               raised an undefined-index notice).
     * @access public
     */
    function getHeadline($nLevel = 1)
    {
        $sKey = "HL$nLevel";

        return isset ($this->m_kHedlines[$sKey]) ? $this->m_kHedlines[$sKey] : null;
    }

    // }}}
    // {{{ getByline()

    /**
     * Return information about the author of a document
     *
     * @param string $sProperty The field of the byline to retrieve ('author'
     *                          or 'title'; matched case-insensitively).
     * @return string The requested byline field, or null if it is not set
     * @access public
     */
    function getByline($sProperty = 'author')
    {
        $sKey = strtolower($sProperty);

        return isset ($this->m_kByline[$sKey]) ? $this->m_kByline[$sKey] : null;
    }

    // }}}
    // {{{ getMedia()

    /**
     * Query for a list of related media elements
     *
     * @param string $sProperty If supplied, only this property will be returned
     *                          for each element of the media reference array.
     * @return array Returns an array of all media reference data, or an array
     *               holding the requested property of every media element
     *               that has it set (elements without it are skipped).
     * @access public
     */
    function getMedia($sProperty = null)
    {
        if (empty ($sProperty)) {
            return $this->m_aMedia;
        }

        $aValues = array ();
        foreach ($this->m_aMedia as $kMediaEntry) {
            if (isset ($kMediaEntry[$sProperty])) {
                $aValues[] = $kMediaEntry[$sProperty];
            }
        }

        return $aValues;
    }

    // }}}
    // {{{ getLede()

    /**
     * Returns the lede (sometimes called lead) paragraph
     *
     * This is simply the first stored paragraph; a paragraph flagged with
     * lede="true" is moved to the front while parsing.
     *
     * @return string Returns the lede paragraph if it is defined, or null otherwise
     * @access public
     */
    function getLede()
    {
        return isset ($this->m_aContent[0]) ? $this->m_aContent[0] : null;
    }

    // }}}
    // {{{ getContent()

    /**
     * Returns the paragraphs of content
     *
     * @return array An array of elements that represent a single paragraph each
     *               (returned by reference)
     * @access public
     */
    function & getContent()
    {
        return $this->m_aContent;
    }

    // }}}
    // {{{ getTagLine()

    /**
     * Returns the tag line (if one exists)
     *
     * @return string The tag line extracted from the NITF data source
     * @access public
     */
    function getTagline()
    {
        return $this->m_sTagline;
    }

    // }}}
    // {{{ getBibliography()

    /**
     * Returns the free-form bibliographic data
     *
     * @return string The bibliography (if one exists) is returned
     * @access public
     */
    function getBibliography()
    {
        return $this->m_sBibliography;
    }

    // }}}
    // {{{ toString()

    /**
     * Get a string version of the article
     *
     * The result is the main headline, the author (if any), then the
     * paragraphs - prefixed with "<location> - " when a location is known -
     * and finally the tagline (if any).
     *
     * @param string $sCRLF The character(s) used to separate each article
     *                      element in the string that is returned - often
     *                      referred to as the CRLF.
     * @return string A string representing the main headline, author, content,
     *                and tagline (returned by reference).
     * @access public
     */
    function & toString($sCRLF = "\n")
    {
        $sArticle = $this->m_kHedlines['HL1'] . $sCRLF;

        if (!empty ($this->m_kByline['author'])) {
            $sArticle .= $this->m_kByline['author'] . $sCRLF;
        }

        if (!empty ($this->m_sLocation)) {
            $sArticle .= "{$this->m_sLocation} - ";
        }

        $sArticle .= implode($sCRLF, $this->m_aContent);

        if (!empty ($this->m_sTagline)) {
            $sArticle .= $sCRLF . $this->m_sTagline;
        }

        return $sArticle;
    }

    // }}}
    // {{{ StartHandler()

    /**
     * Handle start XML elements and attributes
     *
     * Element and attribute names are compared in upper case (presumably
     * because XML_Parser enables case folding - confirm against the parser
     * configuration in use).
     *
     * @param object $oParser The XML parser object instance that was inherited
     *                        from the XML_Parser class
     * @param string $sName   A tag element from the XML data stream
     * @param array  $kAttrib An array of XML attributes associated with the given
     *                        tag supplied
     * @return void
     * @access private
     */
    function StartHandler($oParser, $sName, $kAttrib)
    {
        // Push the element into the stack of XML elements already visited
        array_push($this->m_aParentTags, $sName);

        // Handle the attributes of the XML tags
        switch ($sName) {

        case 'HL2' :
            $this->_sHedline = null;
            break;

        case 'P' :
            if (!empty ($kAttrib['LEDE']) && ($kAttrib['LEDE'] === 'true')) {
                $this->_bIsLede = true;
            }
            $this->_sContent = null;
            break;

        case 'DOC.COPYRIGHT' :
            $this->m_kDocData['copyright'] = $this->_getAttribute($kAttrib, 'HOLDER');
            break;

        case 'DOCDATA' :
            if (!empty ($kAttrib['MANAGEMENT-STATUS'])) {
                $this->m_kDocData['management-status'] = $kAttrib['MANAGEMENT-STATUS'];
            }
            break;

        case 'MEDIA' :
            // Start a fresh media entry; the nested media-* elements fill it in
            $this->_kMedia = array (
                'type'      => !empty ($kAttrib['MEDIA-TYPE']) ? $kAttrib['MEDIA-TYPE'] : 'other',
                'source'    => null,
                'mime-type' => null,
                'caption'   => null,
                'data'      => null,
                'encoding'  => null,
                'producer'  => null,
                'meta'      => array (),
            );
            break;

        case 'MEDIA-REFERENCE' :
            if (!empty ($kAttrib['SOURCE'])) {
                $this->_kMedia['source'] = $kAttrib['SOURCE'];
            } elseif (!empty ($kAttrib['DATA-LOCATION'])) {
                // Compatibility with the AP Usenet feed - note that this is a
                // non standard attribute and is NOT a part of NITF standards
                $this->_kMedia['source'] = $kAttrib['DATA-LOCATION'];
            }
            $this->_kMedia['mime-type'] = $this->_getAttribute($kAttrib, 'MIME-TYPE');
            break;

        case 'MEDIA-OBJECT' :
            $this->_kMedia['encoding'] = $this->_getAttribute($kAttrib, 'ENCODING');
            break;

        case 'MEDIA-METADATA' :
            if (!empty ($kAttrib['NAME'])) {
                $sMetaName  = $kAttrib['NAME'];
                $xMetaValue = $this->_getAttribute($kAttrib, 'VALUE');

                // Collect the pair under 'meta' so callers can tell document
                // supplied metadata apart from the built-in keys ...
                if (isset ($this->_kMedia['meta']) && is_array($this->_kMedia['meta'])) {
                    $this->_kMedia['meta'][$sMetaName] = $xMetaValue;
                }
                // ... and keep the historical top-level entry for existing
                // callers of getMedia($sProperty).
                $this->_kMedia[$sMetaName] = $xMetaValue;
            }
            break;

        case 'PUBDATA' :
            foreach ($kAttrib as $sKey => $sValue) {
                $this->m_kPubData[strtolower($sKey)] = $sValue;
            }
            break;

        case 'DOC-ID' :
            $this->m_kDocData['doc-id'] = $this->_getAttribute($kAttrib, 'ID-STRING');
            break;

        // NITF 3.0 extension - added per request by Lars Schenk
        // (info@lars-schenk.de). Document urgency status information.
        case 'URGENCY' :
            $this->m_kDocData['urgency'] = $this->_getAttribute($kAttrib, 'ED-URG');
            break;

        // The list of keywords or phrases are just added to the array of
        // keywords. A <keyword> without a KEY attribute carries nothing to
        // store and is skipped.
        case 'KEYWORD' :
            if (empty ($this->m_kDocData['key-list'])) {
                $this->m_kDocData['key-list'] = array ();
            }
            if (isset ($kAttrib['KEY'])) {
                array_push($this->m_kDocData['key-list'], $kAttrib['KEY']);
            }
            break;

        // The release, expiration, and issuing dates of this article. The
        // ISO-8601 time stamp settings are preserved, but you can use the
        // magic function strtotime() to convert these to time stamp values.
        case 'DATE.RELEASE' :
        case 'DATE.EXPIRE' :
        case 'DATE.ISSUE' :
            if (!empty ($kAttrib['NORM'])) {
                $this->m_kDocData[strtolower($sName)] = $kAttrib['NORM'];
            }
            break;

        case 'REVISION-HISTORY' :
            array_push($this->m_akRevisions, array_change_key_case($kAttrib, CASE_LOWER));
            break;

        case 'META' :
            if (!empty ($kAttrib['NAME']) && isset ($kAttrib['CONTENT'])) {
                $this->m_kMeta[strtolower($kAttrib['NAME'])] = $kAttrib['CONTENT'];
            }
            break;
        }
    }

    // }}}
    // {{{ EndHandler()

    /**
     * Handle XML tag closing state
     *
     * Finalises whatever the element was collecting (headline, paragraph,
     * byline, media entry) and pops the element off the open-tag stack.
     *
     * @param object $oParser The parser object parsing the XML data
     * @param string $sName   The name of the tag element that has just ended
     * @return void
     * @access private
     */
    function EndHandler($oParser, $sName)
    {
        switch ($sName) {

        case 'HL1' :
            // The cast keeps trim() from being handed null on an empty <hl1>
            $this->m_kHedlines['HL1'] = trim((string) $this->m_kHedlines['HL1']);
            break;

        case 'HEDLINE' :
            // cdataHandler() also appends text found directly inside
            // <hedline> (including the whitespace that follows </hl1>) to
            // HL1, so tidy it up once more when the container closes.
            if (is_string($this->m_kHedlines['HL1'])) {
                $this->m_kHedlines['HL1'] = trim($this->m_kHedlines['HL1']);
            }
            break;

        case 'HL2' :
            array_push($this->m_kHedlines['HL2'], trim((string) $this->_sHedline));
            $this->_sHedline = null;
            break;

        case 'BYLINE' :
            // Byline text is accumulated together with the surrounding
            // whitespace; trim it like headlines and paragraphs are.
            foreach (array ('author', 'title') as $sField) {
                if (is_string($this->m_kByline[$sField])) {
                    $this->m_kByline[$sField] = trim($this->m_kByline[$sField]);
                }
            }
            break;

        case 'P' :
            // Only paragraphs whose text cdataHandler() collects as body
            // content are stored; paragraphs in the body head or inside a
            // media caption would otherwise add empty strings to the content.
            if ($this->_isContentParagraph()) {
                $sParagraph = trim((string) $this->_sContent);
                if ($this->_bIsLede) {
                    array_unshift($this->m_aContent, $sParagraph);
                } else {
                    array_push($this->m_aContent, $sParagraph);
                }
            }
            $this->_bIsLede = false;
            $this->_sContent = null;
            break;

        case 'MEDIA' :
            array_push($this->m_aMedia, $this->_kMedia);
            $this->_kMedia = null;
            break;
        }

        array_pop($this->m_aParentTags);
    }

    // }}}
    // {{{ cdataHandler()

    /**
     * Parses CDATA chunks
     *
     * Routes each chunk of character data to the property that matches the
     * elements currently open. Runs of whitespace are collapsed to a single
     * space unless the data belongs to a <media-object>.
     *
     * @param object $oParser The XML parser instance inherited from the
     *                        XML_Parser class
     * @param string $sData   The data chunk to be processed from the parser
     * @return void
     * @access private
     */
    function cdataHandler($oParser, $sData)
    {
        if (!$this->_isInside('MEDIA-OBJECT')) {
            $sData = preg_replace('#\s+#', ' ', $sData);
        }

        // Elements that can be found in the BODY.HEAD section of the NITF
        // document are defined in this handler.
        if ($this->_isInside('BODY.HEAD')) {

            // We don't care if they use other attribute items, we just want
            // the textual version of the byline. Everything that is not the
            // byline title is appended to the author.
            if ($this->_isInside('BYLINE')) {
                $sField = $this->_isInside('BYTTL') ? 'title' : 'author';
                $this->m_kByline[$sField] .= $sData;
                return;
            }

            // Generally, the distributor is the same as the company supplying
            // the content. However, this is not always the case (the AP, for
            // example).
            if ($this->_isInside('DISTRIBUTOR')) {
                $this->m_sDistributor .= $sData;
                return;
            }

            // The location the story pertains to; other dateline text is
            // discarded.
            if ($this->_isInside('DATELINE')) {
                if ($this->_isInside('LOCATION')) {
                    $this->m_sLocation .= $sData;
                }
                return;
            }

            // There are only two possibilities for hedlines, the main
            // headline or a subheadline.
            if ($this->_isInside('HEDLINE')) {
                if ($this->_isInside('HL2')) {
                    $this->_sHedline .= $sData;
                } else {
                    $this->m_kHedlines['HL1'] .= $sData;
                }
            }

            return;
        }

        // The article content, including the lead and following paragraphs,
        // can be found in this section of the XML document.
        if ($this->_isInside('BODY.CONTENT')) {

            if ($this->_isInside('MEDIA')) {
                // The media caption for the currently selected media element.
                if ($this->_isInside('MEDIA-CAPTION')) {
                    $this->_kMedia['caption'] .= $sData;
                    return;
                }
                // Inline media data (left with its whitespace untouched above)
                if ($this->_isInside('MEDIA-OBJECT')) {
                    $this->_kMedia['data'] .= $sData;
                    return;
                }
            }

            // A paragraph element was found.
            if ($this->_isInside('P')) {
                $this->_sContent .= $sData;
                return;
            }

            // Headlines broken up in the main content should be added back in
            // to the sub-headline array. This can be used to create an index.
            if ($this->_isInside('HL2')) {
                $this->_sHedline .= $sData;
            }
        }

        // The <body.end> tag has two primary elements, <tagline> and the free
        // form <bibliography> tags.
        if ($this->_isInside('BODY.END')) {
            if ($this->_isInside('TAGLINE')) {
                $this->m_sTagline .= $sData;
                return;
            }
            if ($this->_isInside('BIBLIOGRAPHY')) {
                $this->m_sBibliography .= $sData;
            }
        }
    }

    // }}}
    // {{{ _getAttribute()

    /**
     * Read an attribute without raising a notice when it is absent
     *
     * @param array  $kAttrib  The attribute array handed to StartHandler()
     * @param string $sKey     The (upper-case) attribute name
     * @param mixed  $xDefault Value returned when the attribute is not set
     * @return mixed The attribute value, or $xDefault
     * @access private
     */
    function _getAttribute($kAttrib, $sKey, $xDefault = null)
    {
        return isset ($kAttrib[$sKey]) ? $kAttrib[$sKey] : $xDefault;
    }

    // }}}
    // {{{ _isInside()

    /**
     * Tell whether an element with the given name is currently open
     *
     * @param string $sTag The (upper-case) element name to look for
     * @return boolean True if the name is anywhere on the open-tag stack
     * @access private
     */
    function _isInside($sTag)
    {
        return in_array($sTag, $this->m_aParentTags, true);
    }

    // }}}
    // {{{ _isContentParagraph()

    /**
     * Tell whether the <p> being closed had its text collected as body content
     *
     * Mirrors the routing done by cdataHandler(): paragraph text is only
     * gathered inside BODY.CONTENT, and never inside BODY.HEAD, a media
     * caption or a media object.
     *
     * @return boolean True if the paragraph belongs in the content list
     * @access private
     */
    function _isContentParagraph()
    {
        if ($this->_isInside('BODY.HEAD') || !$this->_isInside('BODY.CONTENT')) {
            return false;
        }

        if ($this->_isInside('MEDIA')
            && ($this->_isInside('MEDIA-CAPTION') || $this->_isInside('MEDIA-OBJECT'))) {
            return false;
        }

        return true;
    }

    // }}}
}
// }}}
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* c-hanging-comment-ender-p: nil
* End:
*/
?>