parser_common_syndication.module

Tracking 5.x-1.x branch
  1. drupal
    1. 5 contributions/feedapi/parser_common_syndication/parser_common_syndication.module
    2. 6 contributions/feedapi/parser_common_syndication/parser_common_syndication.module

Parse the incoming URL with SimpleXML then provide a data structure of the feed. Requires PHP5 because of SimpleXML.

Functions & methods

NameDescription
parser_common_syndication_feedapi_feedImplementation of hook_feedapi_feed().
parser_common_syndication_helpImplementation of hook_help().
parser_common_syndication_nodeapiDelete cache validating functions when feed is deleted
parser_common_syndication_requirementsImplementation of hook_requirements().
_parser_common_syndication_atom10_parseParse atom feeds.
_parser_common_syndication_cache_getGet the cached version of the <var>$url</var>
_parser_common_syndication_cache_setStore the parsed feed into the cache
_parser_common_syndication_downloadCall one of the possible feedapi_get hook and pass back the downloaded data
_parser_common_syndication_feedapi_getGet the content from the given URL.
_parser_common_syndication_feedapi_parseParse the feed into a data structure.
_parser_common_syndication_feed_format_detectDetermine the feed format of a SimpleXML parsed object structure.
_parser_common_syndication_linkExtract the link that points to the original content (back to site or original article)
_parser_common_syndication_parse_dateParse a date that comes from a feed.
_parser_common_syndication_parse_w3cdtfParse the W3C date/time format, a subset of ISO 8601.
_parser_common_syndication_RDF10_parseParse RSS1.0/RDF feeds.
_parser_common_syndication_RSS20_parseParse RSS2.0 feeds.
_parser_common_syndication_sanitize_cacheSet the default caching directory if the current setting is not useable
_parser_common_syndication_titlePrepare raw data to be a title

File

View source
  1. <?php
  2. /**
  3. * @file
  4. * Parse the incoming URL with SimpleXML then provide a data structure of the feed.
  5. * Requires PHP5 because of SimpleXML.
  6. */
  /**
   * Implementation of hook_help().
   *
   * @param $section
   *   The help section being requested.
   * @return
   *   The translated text for the 'admin/modules#description' and
   *   'feedapi/full_name' sections; nothing for any other section.
   */
  function parser_common_syndication_help($section) {
    if ($section == 'admin/modules#description') {
      return t('Provide a common syndication parser for FeedAPI-compatible modules. Only PHP5-compatible. Rather fast.');
    }
    if ($section == 'feedapi/full_name') {
      return t('Parser Common Syndication - only for PHP5');
    }
  }
  /**
   * Implementation of hook_feedapi_feed().
   *
   * @param $op
   *   - 'type': return the list of feed types this parser handles.
   *   - 'compatible': download the feed object given as second argument and
   *     report whether this parser can handle it.
   *   - 'parse': parse the feed object given as second argument.
   * @return
   *   - 'type': array of type names.
   *   - 'compatible': the type name when the feed can be parsed, FALSE otherwise.
   *   - 'parse': the parsed feed object, or FALSE on failure.
   *   Nothing for any other operation.
   */
  function parser_common_syndication_feedapi_feed($op) {
    $args = func_get_args();
    switch ($op) {
      case 'type':
        return array("XML feed");

      case 'compatible':
        if (!function_exists('simplexml_load_string')) {
          return FALSE;
        }
        // Without a feed object carrying a URL there is nothing to download.
        if (!isset($args[1]) || !is_object($args[1]) || !isset($args[1]->url)) {
          return FALSE;
        }
        $url = $args[1]->url;
        // This module only knows one type, so there is nothing to choose from.
        // reset() needs a real variable because it takes its argument by
        // reference.
        $types = parser_common_syndication_feedapi_feed('type');
        $type = reset($types);

        $downloaded_string = _parser_common_syndication_download($url, $op);
        // An object means the parsed feed came straight from the cache.
        if (is_object($downloaded_string)) {
          return $type;
        }
        // The download failed, so there is nothing to inspect.
        if (!is_string($downloaded_string)) {
          return FALSE;
        }
        if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
          @ $xml = simplexml_load_string($downloaded_string, NULL);
        }
        else {
          @ $xml = simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
        }
        if (_parser_common_syndication_feed_format_detect($xml) != FALSE) {
          // The parser is compatible. Parse and cache the feed right away,
          // because the download step may already have stored the ETag data
          // (depends on the web server).
          $parsed_feed = _parser_common_syndication_feedapi_parse($xml);
          $from_cache = is_object($parsed_feed) && isset($parsed_feed->from_cache) && $parsed_feed->from_cache === TRUE;
          if (is_object($parsed_feed) && !$from_cache) {
            _parser_common_syndication_cache_set($url, $parsed_feed);
          }
          return $type;
        }
        return FALSE;

      case 'parse':
        $feed = (isset($args[1]) && is_object($args[1])) ? $args[1] : FALSE;
        $parsed_feed = _parser_common_syndication_feedapi_parse($feed);
        $from_cache = is_object($parsed_feed) && isset($parsed_feed->from_cache) && $parsed_feed->from_cache === TRUE;
        if (is_object($parsed_feed) && !$from_cache && isset($feed->url)) {
          _parser_common_syndication_cache_set($feed->url, $parsed_feed);
        }
        return $parsed_feed;
    }
  }
  /**
   * Implementation of hook_requirements().
   *
   * Reports whether PHP 5 and the SimpleXML extension are available.
   *
   * @return
   *   A requirements array with a single 'Parser Common Syndication' entry
   *   whose severity is REQUIREMENT_OK or REQUIREMENT_ERROR.
   */
  function parser_common_syndication_requirements() {
    // Every string goes through the function returned by get_t() instead of
    // mixing it with direct t() calls.
    $t = get_t();
    $available = version_compare('5', PHP_VERSION, '<=') && function_exists('simplexml_load_file');
    return array(
      'Parser Common Syndication' => array(
        'title' => $t('SimpleXML library.'),
        'description' => $t('A fast XML parsing library. (From PHP5)'),
        'severity' => $available ? REQUIREMENT_OK : REQUIREMENT_ERROR,
        'value' => $available ? $t('Available') : $t('Missing'),
      ),
    );
  }
  /**
   * Parse the feed into a data structure.
   *
   * @param $feed
   *   Either a feed object with a url property, or an already loaded
   *   SimpleXMLElement.
   * @return
   *   The structured data extracted from the feed (stdClass), the cached
   *   parsed feed object when the download layer returned one, or FALSE when
   *   the feed cannot be downloaded, is malformed or has an unknown format.
   */
  function _parser_common_syndication_feedapi_parse($feed) {
    // Callers pass FALSE when they have no feed object; there is nothing to
    // download or parse in that case.
    if (!is_object($feed)) {
      return FALSE;
    }
    if ($feed instanceof SimpleXMLElement) {
      $xml = $feed;
    }
    else {
      if (!isset($feed->url)) {
        return FALSE;
      }
      $downloaded_string = _parser_common_syndication_download($feed->url, 'parse');
      // FALSE: download failed. Object: parsed feed served from the cache.
      if ($downloaded_string === FALSE || is_object($downloaded_string)) {
        return $downloaded_string;
      }
      if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
        @ $xml = simplexml_load_string($downloaded_string, NULL);
      }
      else {
        @ $xml = simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
      }
      // Got a malformed XML.
      if ($xml === FALSE || $xml == NULL) {
        return FALSE;
      }
    }
    $feed_type = _parser_common_syndication_feed_format_detect($xml);
    if ($feed_type === "atom1.0") {
      return _parser_common_syndication_atom10_parse($xml);
    }
    // RSS 0.91 and 0.92 are handled by the RSS 2.0 parser.
    if (in_array($feed_type, array("RSS2.0", "RSS0.91", "RSS0.92"), TRUE)) {
      return _parser_common_syndication_RSS20_parse($xml);
    }
    if ($feed_type === "RDF") {
      return _parser_common_syndication_RDF10_parse($xml);
    }
    return FALSE;
  }
  /**
   * Get the cached version of the <var>$url</var>
   *
   * @param $url
   *   The feed URL; its md5 hash is the cache file name.
   * @return
   *   The unserialized cache content, or FALSE when the cache directory is
   *   unusable or no readable cache file exists for the URL.
   */
  function _parser_common_syndication_cache_get($url) {
    $cache_dir = _parser_common_syndication_sanitize_cache();
    // Without a usable directory the path would degrade to "/<md5>".
    if ($cache_dir === FALSE) {
      return FALSE;
    }
    $cache_file = $cache_dir .'/'. md5($url);
    if (!is_file($cache_file)) {
      return FALSE;
    }
    $file_content = file_get_contents($cache_file);
    if ($file_content === FALSE || $file_content === '') {
      return FALSE;
    }
    // NOTE(review): unserialize() is only safe while nothing but this module
    // can write into the cache directory - confirm the directory permissions.
    return unserialize($file_content);
  }
  /**
   * Store the parsed feed into the cache
   *
   * Does nothing when the cache directory is unusable or the cache file
   * cannot be opened for writing.
   *
   * @param $url
   *   The feed URL; its md5 hash is the cache file name.
   * @param $parsed_feed
   *   The parsed feed to serialize into the cache file.
   */
  function _parser_common_syndication_cache_set($url, $parsed_feed) {
    $cache_dir = _parser_common_syndication_sanitize_cache();
    // Without a usable directory the path would degrade to "/<md5>".
    if ($cache_dir === FALSE) {
      return;
    }
    // Serialize before opening the file, so a serialization failure cannot
    // leave a truncated cache file behind.
    $payload = serialize($parsed_feed);
    $cache_fp = fopen($cache_dir .'/'. md5($url), 'w');
    if ($cache_fp === FALSE) {
      return;
    }
    fwrite($cache_fp, $payload);
    fclose($cache_fp);
  }
  /**
   * Get the content from the given URL.
   *
   * Sends the stored ETag / Last-Modified validators with the request. When
   * the server answers 304, the cached parsed feed is returned instead of the
   * raw data.
   *
   * @param $url
   *   A valid URL (not only web URLs).
   * @param $username
   *   If the URL use authentication, here you can supply the username for this.
   * @param $password
   *   If the URL use authentication, here you can supply the password for this.
   * @return
   *   The data pulled from the URL, the cached parsed feed object (with
   *   from_cache set to TRUE) when the feed does not need a refresh, NULL when
   *   the response carries no data, or FALSE when the server keeps answering
   *   304 although nothing usable is cached.
   */
  function _parser_common_syndication_feedapi_get($url, $username = NULL, $password = NULL) {
    $method = 'GET';
    $follow = 3;
    $data = NULL;
    // Only download and parse data if it really needs a refresh. Based on
    // ETag / If-None-Match and Last-Modified / If-Modified-Since.
    $headers = array();
    $validate = db_fetch_array(db_query("SELECT etag, last_modified FROM {parser_common_syndication} WHERE url = '%s'", $url));
    if (!empty($validate['etag'])) {
      $headers['If-None-Match'] = $validate['etag'];
    }
    if (!empty($validate['last_modified'])) {
      $headers['If-Modified-Since'] = $validate['last_modified'];
    }
    if (!empty($username)) {
      $headers['Authorization'] = 'Basic '. base64_encode("$username:$password");
    }
    $result = drupal_http_request($url, $headers, $method, $data, $follow);

    if (isset($result->code) && $result->code == 304) {
      $cached_data = _parser_common_syndication_cache_get($url);
      if (is_object($cached_data)) {
        $cached_data->from_cache = TRUE;
        return $cached_data;
      }
      // The cache file should exist and hold good data, but it does not.
      // Forget the validators and fetch once more unconditionally. This is a
      // single retry rather than recursion, so a server that keeps sending 304
      // cannot loop forever.
      db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $url);
      unset($headers['If-None-Match'], $headers['If-Modified-Since']);
      $result = drupal_http_request($url, $headers, $method, $data, $follow);
      if (isset($result->code) && $result->code == 304) {
        return FALSE;
      }
    }

    // HTTP header names are case-insensitive ("ETag", "Etag", "etag"), so look
    // them up without relying on one spelling; absent headers are stored as ''.
    $etag = '';
    $last_modified = '';
    if (isset($result->headers) && is_array($result->headers)) {
      foreach ($result->headers as $name => $value) {
        $name = strtolower($name);
        if ($name == 'etag') {
          $etag = $value;
        }
        elseif ($name == 'last-modified') {
          $last_modified = $value;
        }
      }
    }

    if (db_result(db_query("SELECT COUNT(*) FROM {parser_common_syndication} WHERE url = '%s'", $url)) == 0) {
      db_query("INSERT INTO {parser_common_syndication} (etag, last_modified, url) VALUES ('%s', '%s', '%s')", $etag, $last_modified, $url);
    }
    else {
      db_query("UPDATE {parser_common_syndication} SET etag = '%s', last_modified = '%s' WHERE url = '%s'", $etag, $last_modified, $url);
    }
    return isset($result->data) ? $result->data : NULL;
  }
  /**
   * Implementation of hook_nodeapi().
   *
   * Deletes the stored cache validation data (ETag / Last-Modified row) and
   * the cache file of a feed when its node is deleted.
   *
   * @param $node
   *   The node being acted on.
   * @param $op
   *   The node operation; only 'delete' is handled.
   */
  function parser_common_syndication_nodeapi(&$node, $op) {
    if (!(isset($node->feed) || feedapi_enabled_type($node->type))) {
      return;
    }
    if ($op != 'delete') {
      return;
    }
    // A feed-enabled node type does not guarantee that this node carries feed
    // data; without a URL there is nothing to clean up.
    if (!isset($node->feed->url)) {
      return;
    }
    db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $node->feed->url);
    $cache_dir = _parser_common_syndication_sanitize_cache();
    // Without a usable directory the path would degrade to "/<md5>".
    if ($cache_dir === FALSE) {
      return;
    }
    $cache_filename = $cache_dir .'/'. md5($node->feed->url);
    if (file_exists($cache_filename)) {
      unlink($cache_filename);
    }
  }
  /**
   * Determine the feed format of a SimpleXML parsed object structure.
   *
   * The decision is based on the lower-cased root element name, the root's
   * "version" attribute (for RSS) and the presence of entry / channel children.
   *
   * @param $xml
   *   SimpleXML-preprocessed feed.
   * @return
   *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92", "RDF", or FALSE if the
   *   structure is not an object or not a recognised format.
   */
  function _parser_common_syndication_feed_format_detect($xml) {
    if (!is_object($xml)) {
      return FALSE;
    }
    $root_attributes = $xml->attributes();
    $root_name = strtolower($xml->getName());

    if ($root_name == "feed") {
      return isset($xml->entry) ? "atom1.0" : FALSE;
    }
    if ($root_name == "rdf") {
      return isset($xml->channel) ? "RDF" : FALSE;
    }
    if ($root_name == "rss") {
      $known_versions = array("2.0" => "RSS2.0", "0.91" => "RSS0.91", "0.92" => "RSS0.92");
      foreach ($known_versions as $version => $format) {
        // Loose comparison on purpose, exactly as the attribute was compared
        // before.
        if ($root_attributes["version"] == (string) $version) {
          return $format;
        }
      }
    }
    return FALSE;
  }
  /**
   * Download a feed and pass back the downloaded data.
   *
   * When the downloaded document is HTML, the feed advertised through a
   * <link> tag (autodiscovery) is downloaded instead.
   *
   * @param $url
   *   The URL to download. Credentials embedded as user:pass@ are passed on
   *   as username and password.
   * @param $op
   *   The operation on whose behalf the download happens; it is only handed
   *   on to the recursive autodiscovery download.
   * @return
   *   string - the downloaded and filtered data,
   *   object - the parsed feed when it was served from the cache,
   *   FALSE - if the URL is not reachable or delivered no data.
   */
  function _parser_common_syndication_download($url, $op) {
    // Counts nested autodiscovery downloads, so HTML pages that point at each
    // other cannot cause endless recursion.
    static $autodiscovery_hops = 0;

    // Initialised up front so both are defined for URLs that fail validation.
    $username = $password = NULL;
    if (valid_url($url, TRUE)) {
      // Handle password protected feeds.
      $url_parts = parse_url($url);
      if (!empty($url_parts['user'])) {
        $username = $url_parts['user'];
        $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
      }
    }
    $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password);
    // The data comes from cache, just pass it one level up.
    if (is_object($downloaded_string)) {
      return $downloaded_string;
    }
    // Cannot get the feed, pass the problem one level up.
    if (!is_string($downloaded_string) || $downloaded_string == "") {
      return FALSE;
    }

    // Do the autodiscovery at this level. Only HTML documents are worth
    // searching for <link> tags.
    if (stripos($downloaded_string, "<html") !== FALSE && $autodiscovery_hops < 3) {
      $rss_link = _parser_common_syndication_discover_feed_url($downloaded_string, $url);
      if ($rss_link !== FALSE) {
        $autodiscovery_hops++;
        $discovered = _parser_common_syndication_download($rss_link, $op);
        $autodiscovery_hops--;
        // The nested call already filtered the data (or returned FALSE or a
        // cached object), so hand its result back untouched.
        return $discovered;
      }
    }

    // Filter out strange tags. Without this, the text would contain strange stuff.
    // @todo: make sure that these are not important for feed element mapper
    $filtered = preg_replace(array('@<script[^>]*?.*?</script>@si', '@<object[^>]*?.*?</object>@si', '@<embed[^>]*?.*?</embed>@si', '@<applet[^>]*?.*?</applet>@si', '@<noframes[^>]*?.*?</noframes>@si', '@<noscript[^>]*?.*?</noscript>@si', '@<noembed[^>]*?.*?</noembed>@si'), array('', '', '', '', '', '', ''), $downloaded_string);
    // preg_replace() yields NULL on a PCRE error; keep the unfiltered data then.
    if (is_string($filtered)) {
      $downloaded_string = $filtered;
    }
    // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside <feed>
    $rebased = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
    if (is_string($rebased)) {
      $downloaded_string = $rebased;
    }
    return $downloaded_string;
  }

  /**
   * Find the first feed URL advertised by the <link> tags of an HTML document.
   *
   * @param $html
   *   The HTML source.
   * @param $page_url
   *   The URL the HTML was downloaded from; used to resolve relative links and
   *   to skip links that point back at the page itself.
   * @return
   *   An absolute feed URL, or FALSE when none is advertised.
   */
  function _parser_common_syndication_discover_feed_url($html, $page_url) {
    $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");
    $link_tags = array();
    preg_match_all('/<link\s+(.*?)\s*\/?>/si', $html, $link_tags);
    foreach ($link_tags[1] as $attributes) {
      // Only links whose type attribute carries an allowed mime type count.
      $mime = array();
      if (!preg_match('/type\s*=\s*(["\'])([A-Za-z\/+]*)\1/si', $attributes, $mime)) {
        continue;
      }
      if (!in_array(strtolower($mime[2]), $allowed_mime)) {
        continue;
      }
      // Accept everything between the quotes, so URLs containing "-", "&",
      // "%" or "~" are not rejected.
      $href = array();
      if (!preg_match('/href\s*=\s*(["\'])(.*?)\1/si', $attributes, $href)) {
        continue;
      }
      $feed_url = trim(html_entity_decode($href[2], ENT_QUOTES, 'UTF-8'));
      if ($feed_url == '' || $feed_url == $page_url) {
        continue;
      }
      $parsed_url = parse_url($feed_url);
      if ($parsed_url === FALSE) {
        continue;
      }
      if (!isset($parsed_url['host'])) {
        // A scheme without a host (mailto:, javascript:) cannot be a feed.
        if (isset($parsed_url['scheme'])) {
          continue;
        }
        // It's relative so make it absolute: prefer the document's <base>
        // tag, otherwise guess from the page URL.
        $base_url = $page_url;
        $base_tag = array();
        if (preg_match('/<base\s[^>]*?href\s*=\s*(["\'])(.*?)\1/si', $html, $base_tag) && trim($base_tag[2]) != '') {
          $base_url = trim($base_tag[2]);
        }
        $absolute_url = _parser_common_syndication_absolute_url($feed_url, $base_url);
        if ($absolute_url === FALSE && $base_url != $page_url) {
          $absolute_url = _parser_common_syndication_absolute_url($feed_url, $page_url);
        }
        if ($absolute_url === FALSE || $absolute_url == $page_url) {
          continue;
        }
        $feed_url = $absolute_url;
      }
      return $feed_url;
    }
    return FALSE;
  }

  /**
   * Turn a relative link into an absolute URL.
   *
   * Handles protocol-relative, root-relative, query-only and path-relative
   * links. "./" and "../" segments are not collapsed.
   *
   * @param $link
   *   The relative link.
   * @param $base_url
   *   The absolute URL the link is relative to.
   * @return
   *   The absolute URL, or FALSE when the base has no scheme or host or the
   *   link is only a fragment.
   */
  function _parser_common_syndication_absolute_url($link, $base_url) {
    $base = parse_url($base_url);
    if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
      return FALSE;
    }
    // A bare fragment points into the base document itself.
    if (strpos($link, '#') === 0) {
      return FALSE;
    }
    if (strpos($link, '//') === 0) {
      return $base['scheme'] .':'. $link;
    }
    $root = $base['scheme'] .'://'. $base['host'] . (isset($base['port']) ? ':'. $base['port'] : '');
    if (strpos($link, '/') === 0) {
      return $root . $link;
    }
    $path = isset($base['path']) ? $base['path'] : '/';
    if (strpos($link, '?') === 0) {
      return $root . $path . $link;
    }
    $last_slash = strrpos($path, '/');
    $directory = ($last_slash === FALSE) ? '/' : substr($path, 0, $last_slash + 1);
    return $root . $directory . $link;
  }
  /**
   * Parse atom feeds.
   *
   * @param $feed_XML
   *   The SimpleXMLElement of the feed's root element.
   * @return
   *   stdClass with title, description, options->link and items. Each item has
   *   title, description and options (original_author, timestamp,
   *   original_url, guid, tags, domains).
   */
  function _parser_common_syndication_atom10_parse($feed_XML) {
    $parsed_source = new stdClass();
    // The download step renames xml:base to base, so it shows up as a plain
    // attribute. xpath() may return FALSE, so the result is not used blindly.
    $base_attributes = $feed_XML->xpath("@base");
    $base = (is_array($base_attributes) && count($base_attributes) > 0) ? (string) reset($base_attributes) : '';
    if (!valid_url($base, TRUE)) {
      $base = FALSE;
    }
    // Detect the title
    $parsed_source->title = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
    // Detect the description
    $parsed_source->description = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
    $parsed_source->options = new stdClass();
    $parsed_source->options->link = _parser_common_syndication_link($feed_XML->link);
    // A link that is a valid relative but not a valid absolute URL is resolved
    // against the feed's base.
    if (valid_url($parsed_source->options->link) && !valid_url($parsed_source->options->link, TRUE) && !empty($base)) {
      $parsed_source->options->link = $base . $parsed_source->options->link;
    }
    // The feed level author is the fallback for entries without an author.
    $feed_author = !empty($feed_XML->author->name) ? "{$feed_XML->author->name}" : NULL;

    $parsed_source->items = array();
    foreach ($feed_XML->entry as $news) {
      $guid = !empty($news->id) ? "{$news->id}" : NULL;

      // Always start from empty lists, so entries without categories get
      // arrays instead of undefined values.
      $additional_taxonomies = array('ATOM Categories' => array(), 'ATOM Domains' => array());
      if (isset($news->category)) {
        foreach ($news->category as $category) {
          $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
          if (isset($category['scheme'])) {
            $domain = "{$category['scheme']}";
            if (!empty($domain)) {
              if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
                $additional_taxonomies['ATOM Domains'][$domain] = array();
              }
              // Domains refer to the category by its index in the tag list.
              $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
            }
          }
        }
      }

      $title = "{$news->title}";

      // Reset per entry, so an entry without content or summary does not
      // inherit the body of the previous one.
      $body = '';
      $text_element = NULL;
      if (!empty($news->content)) {
        $text_element = $news->content;
      }
      else if (!empty($news->summary)) {
        $text_element = $news->summary;
      }
      if ($text_element !== NULL) {
        // Child markup first, then the element's own text.
        foreach ($text_element->children() as $child) {
          $body .= $child->asXML();
        }
        $body .= "{$text_element}";
      }

      // Entry source author, then entry author, then feed author.
      $original_author = $feed_author;
      if (!empty($news->source->author->name)) {
        $original_author = "{$news->source->author->name}";
      }
      else if (!empty($news->author->name)) {
        $original_author = "{$news->author->name}";
      }

      // The entry's link wins. Without one, fall back to the content src
      // attribute (some valid atom feeds carry no URL there at all) and then
      // to the id, which is sometimes the URL.
      $original_url = _parser_common_syndication_link($news->link);
      if ($original_url == '') {
        if (!empty($news->content['src']) && valid_url("{$news->content['src']}")) {
          $original_url = "{$news->content['src']}";
        }
        else if ($guid !== NULL && valid_url($guid, TRUE)) {
          $original_url = $guid;
        }
      }
      if (valid_url($original_url) && !valid_url($original_url, TRUE) && !empty($base)) {
        $original_url = $base . $original_url;
      }

      // First date element present wins; "updated" and "modified" cover
      // entries that carry neither "published" nor "issued".
      $date = '';
      foreach (array('published', 'issued', 'updated', 'modified') as $date_element) {
        if (isset($news->{$date_element})) {
          $date = "{$news->{$date_element}}";
          break;
        }
      }

      $item = new stdClass();
      $item->title = _parser_common_syndication_title($title);
      $item->description = $body;
      $item->options = new stdClass();
      $item->options->original_author = $original_author;
      $item->options->timestamp = _parser_common_syndication_parse_date($date);
      $item->options->original_url = $original_url;
      $item->options->guid = $guid;
      $item->options->tags = $additional_taxonomies['ATOM Categories'];
      $item->options->domains = $additional_taxonomies['ATOM Domains'];
      $parsed_source->items[] = $item;
    }
    return $parsed_source;
  }
  /**
   * Parse RSS1.0/RDF feeds.
   *
   * @param $feed_XML
   *   The SimpleXMLElement of the feed's root element.
   * @return
   *   stdClass with title, description, options->link and items. Each item has
   *   title, description and options (original_author, timestamp,
   *   original_url, guid, tags).
   */
  function _parser_common_syndication_RDF10_parse($feed_XML) {
    $parsed_source = new stdClass();
    // Detect the title.
    $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
    // Detect the description.
    $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
    $parsed_source->options = new stdClass();
    // Detect the link.
    $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
    $parsed_source->items = array();
    // Set category splitter (space is for del.icio.us feed).
    $category_splitter = ' ';
    // The channel title is the default original author.
    $oa = NULL;
    if ($feed_XML->channel->title) {
      $oa = "{$feed_XML->channel->title}";
    }
    // Versions prior 5.1.2 don't allow namespaces.
    $supports_namespaces = version_compare(phpversion(), '5.1.2', '>=');
    if ($supports_namespaces) {
      $namespaces = $feed_XML->getNamespaces(TRUE);
    }
    else {
      $namespaces = array('default' => NULL);
    }

    foreach ($feed_XML->item as $news) {
      // Initialization.
      $guid = $original_url = NULL;
      $title = $body = '';
      $pub_date = $dc_date = '';
      $additional_taxonomies = array('RDF Categories' => array());
      $original_author = $oa;

      foreach ($namespaces as $ns_link) {
        // Get about attribute as guid.
        foreach ($news->attributes($ns_link) as $name => $value) {
          if ($name == 'about') {
            $guid = "{$value}";
          }
        }
        // Get children for current namespace.
        if ($supports_namespaces) {
          $ns = (array) $news->children($ns_link);
        }
        else {
          $ns = (array) $news;
        }
        // Title
        if (!empty($ns['title'])) {
          $title = "{$ns['title']}";
        }
        // Description or dc:description
        if (!empty($ns['description']) && $body == '') {
          $body = "{$ns['description']}";
        }
        // Link
        if (!empty($ns['link'])) {
          $original_url = "{$ns['link']}";
        }
        // dc:creator
        if (!empty($ns['creator'])) {
          $original_author = "{$ns['creator']}";
        }
        // content:encoded
        if (!empty($ns['encoded'])) {
          $body = "{$ns['encoded']}";
        }
        // Dates are remembered while their namespace is being visited. Reading
        // them after the loop would only see the last namespace's children.
        if (!empty($ns['pubDate'])) {
          $pub_date = "{$ns['pubDate']}";
        }
        if (!empty($ns['date'])) {
          $dc_date = "{$ns['date']}";
        }
        // dc:subject
        if (!empty($ns['subject'])) {
          // There can be multiple category tags.
          if (is_array($ns['subject'])) {
            foreach ($ns['subject'] as $cat) {
              if (is_object($cat)) {
                $additional_taxonomies['RDF Categories'][] = trim(strip_tags($cat->asXML()));
              }
              else {
                $additional_taxonomies['RDF Categories'][] = $cat;
              }
            }
          }
          else {
            // Or a single tag holding a splitter separated list.
            $additional_taxonomies['RDF Categories'] = explode($category_splitter, "{$ns['subject']}");
          }
        }
      }
      // The description is not mandatory so use title if description not present.
      if (empty($body)) {
        $body = $title;
      }
      // If there are no link tag but rdf:about is provided.
      if (empty($original_url) && !empty($guid)) {
        $original_url = $guid;
      }
      $item = new stdClass();
      $item->title = _parser_common_syndication_title($title);
      $item->description = $body;
      $item->options = new stdClass();
      $item->options->original_author = $original_author;
      // pubDate is preferred, dc:date is the fallback.
      $item->options->timestamp = _parser_common_syndication_parse_date($pub_date != '' ? $pub_date : $dc_date);
      $item->options->original_url = $original_url;
      $item->options->guid = $guid;
      $item->options->tags = $additional_taxonomies['RDF Categories'];
      $parsed_source->items[] = $item;
    }
    return $parsed_source;
  }
  /**
   * Parse RSS2.0 feeds.
   *
   * @param $feed_XML
   *   The SimpleXMLElement of the feed's root element.
   * @return
   *   stdClass with title, description, options->link and items. Each item has
   *   title, description and options (original_author, timestamp,
   *   original_url, guid, tags, domains).
   */
  function _parser_common_syndication_RSS20_parse($feed_XML) {
    $parsed_source = new stdClass();
    // Detect the title.
    $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
    // Detect the description.
    $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
    $parsed_source->options = new stdClass();
    // Detect the link.
    $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
    $parsed_source->items = array();

    // The channel title serves as the original author of every item. It is
    // resolved once here, so it is always defined.
    $original_author = NULL;
    if (!empty($feed_XML->channel->title)) {
      $original_author = "{$feed_XML->channel->title}";
    }
    // Namespaced children are only read on PHP >= 5.1.2.
    $supports_namespaces = version_compare(phpversion(), '5.1.2', '>=');

    // xpath() returns FALSE on error.
    $news_items = $feed_XML->xpath('//item');
    if (!is_array($news_items)) {
      $news_items = array();
    }
    foreach ($news_items as $news) {
      $categories = $news->xpath('category');
      if (!is_array($categories)) {
        $categories = array();
      }
      $content = array();
      $dc = array();
      if ($supports_namespaces) {
        $content = (array) $news->children('http://purl.org/rss/1.0/modules/content/');
        $dc = (array) $news->children('http://purl.org/dc/elements/1.1/');
      }
      $news = (array) $news;

      // The array cast can leave a value as an element object instead of a
      // string, so everything taken from it is converted to a string.
      $guid = isset($news['guid']) ? "{$news['guid']}" : NULL;
      $title = isset($news['title']) ? "{$news['title']}" : '';

      if (isset($news['description'])) {
        $body = "{$news['description']}";
      }
      // Some sources use content:encoded as description i.e. PostNuke PageSetter module.
      elseif (isset($news['encoded'])) { // content:encoded for PHP < 5.1.2.
        $body = "{$news['encoded']}";
      }
      elseif (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
        $body = "{$content['encoded']}";
      }
      else {
        $body = $title;
      }

      $original_url = !empty($news['link']) ? "{$news['link']}" : NULL;

      // pubDate first, dc:date as fallback; an empty string makes the date
      // parser fall back to the current time.
      $date = '';
      if (isset($news['pubDate'])) {
        $date = "{$news['pubDate']}";
      }
      elseif (isset($dc['date'])) {
        $date = "{$dc['date']}";
      }

      $additional_taxonomies = array('RSS Categories' => array(), 'RSS Domains' => array());
      foreach ($categories as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Domains refer to the category by its index in the tag list.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }

      $item = new stdClass();
      $item->title = _parser_common_syndication_title($title);
      $item->description = $body;
      $item->options = new stdClass();
      $item->options->original_author = $original_author;
      $item->options->timestamp = _parser_common_syndication_parse_date($date);
      $item->options->original_url = $original_url;
      $item->options->guid = $guid;
      $item->options->tags = $additional_taxonomies['RSS Categories'];
      $item->options->domains = $additional_taxonomies['RSS Domains'];
      $parsed_source->items[] = $item;
    }
    return $parsed_source;
  }
  /**
   * Set the default caching directory if the current setting is not useable
   *
   * @return
   *   The path of the "parser_common_syndication_cache" directory below the
   *   files directory, or FALSE when that directory is not writable (after an
   *   attempt to create it).
   */
  function _parser_common_syndication_sanitize_cache() {
    $cache_location = file_directory_path() .'/parser_common_syndication_cache';
    // Nothing to repair when the directory is already there and writable.
    if (is_writeable($cache_location) && is_dir($cache_location)) {
      return $cache_location;
    }
    $cache_location = file_create_path($cache_location);
    $is_missing = !file_exists($cache_location);
    if ($is_missing && is_writable(file_directory_path())) {
      mkdir($cache_location);
    }
    if (is_writeable($cache_location)) {
      return $cache_location;
    }
    return FALSE;
  }
  /**
   * Parse a date that comes from a feed.
   *
   * strtotime() is tried first; when it fails, the string is handed to the
   * W3C date/time parser.
   *
   * @param $date_str
   *   The date string in various formats.
   * @return
   *   The timestamp of the string or the current time if can't be parsed
   */
  function _parser_common_syndication_parse_date($date_str) {
    $timestamp = strtotime($date_str);
    // -1 is treated as a failure value alongside FALSE.
    $strtotime_failed = ($timestamp === FALSE || $timestamp == -1);
    if ($strtotime_failed) {
      $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
    }
    if ($timestamp === FALSE) {
      return time();
    }
    return $timestamp;
  }
  /**
   * Parse the W3C date/time format, a subset of ISO 8601.
   *
   * Handles YYYY-MM-DDThh:mm, optionally followed by :ss, a decimal fraction
   * of a second (ignored) and a time zone designator (Z or +hh:mm / -hh:mm).
   * A missing designator is treated as GMT.
   *
   * See http://www.w3.org/TR/NOTE-datetime for more information.
   * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
   *
   * @param $date_str
   *   A string with a potentially W3C DTF date.
   * @return
   *   A timestamp if parsed successfully or FALSE if not.
   */
  function _parser_common_syndication_parse_w3cdtf($date_str) {
    // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 seconds,
    // 7 offset sign, 8 offset hours, 9 offset minutes, 10 "Z".
    $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
    $match = array();
    if (!preg_match($pattern, (string) $date_str, $match)) {
      return FALSE;
    }
    // Trailing groups that did not take part in the match are absent from
    // $match, so every optional group is checked before use.
    $seconds = (isset($match[6]) && $match[6] !== '') ? (int) $match[6] : 0;
    // Calculate the epoch for current date assuming GMT.
    $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

    // Z (zulu time, aka GMT) or no designator at all needs no correction.
    if (isset($match[7]) && $match[7] !== '') {
      $offset_secs = (((int) $match[8] * 60) + (int) $match[9]) * 60;
      // Is timezone ahead of GMT? If yes, subtract offset.
      if ($match[7] == '+') {
        $offset_secs *= -1;
      }
      $epoch += $offset_secs;
    }
    return $epoch;
  }
  /**
   * Extract the link that points to the original content.
   *
   * The first link whose rel is "alternate" wins. A link without a rel
   * attribute counts as "alternate" too, as the Atom specification requires.
   * When no such link exists, the href of the last link is returned.
   *
   * @param $links
   *   Array of SimpleXML objects
   * @return
   *   The href of the chosen link, or an empty string when there is no link
   *   or the chosen link has no href.
   */
  function _parser_common_syndication_link($links) {
    $to_link = '';
    // Anything that cannot be iterated holds no links.
    if (!is_array($links) && !is_object($links)) {
      return $to_link;
    }
    foreach ($links as $link) {
      $attributes = $link->attributes();
      $to_link = isset($attributes["href"]) ? "{$attributes["href"]}" : "";
      // A missing rel attribute means "alternate".
      $rel = isset($attributes["rel"]) ? "{$attributes["rel"]}" : 'alternate';
      if ($rel == 'alternate') {
        break;
      }
    }
    return $to_link;
  }
  /**
   * Prepare raw data to be a title
   *
   * @param $title
   *   The raw title string.
   * @return
   *   The title with tags stripped and HTML entities (including quotes)
   *   decoded as UTF-8.
   */
  function _parser_common_syndication_title($title) {
    $without_tags = strip_tags($title);
    $decoded = html_entity_decode($without_tags, ENT_QUOTES, 'UTF-8');
    return $decoded;
  }