$HOST, 'username' => $USER_NAME, 'password' => $PASSWORD, 'dbname' => $DATABASE_NAME ));
// NOTE: the line above is the tail of a statement that begins before this
// section (presumably the adapter setup that defines $db -- confirm); it is
// reproduced unchanged.

// Ask the database connection to use UTF-8.
$db->query("SET NAMES utf8");

// Phrases that mark a quote as anonymously sourced. Matching is a
// case-sensitive substring test, which is why some phrases are listed in two
// capitalisations. The list does not depend on the feed entry, so it is built
// once instead of once per entry.
$search = array(
    'did not wish to be identified',
    'did not want to be identified',
    'declined to be identified',
    'refused to be identified',
    'an anonymous source',
    'An anonymous source',
    'asked not to be identified',
    'declined to give her name',
    'declined to give his name',
    'on condition of anonymity',
    'requested anonymity',
    'asked that his name not be used',
    'asked that her name not be used',
    'refused to give her name',
    'refused to give his name',
    'sources close to',
    'a source close to',
    'A source close to',
    'asked not to be named',
    'declined to be named',
    'refused to be named',
    'wouldn\'t give his name',
    'wouldn\'t give her name',
    'spoke on background',
    'speaking on background',
    'spoke off the record',
    'speaking off the record',
    'speak off the record',
    'comment off the record',
    'would not speak for attribution',
    'declined to speak for attribution',
    'refused to speak for attribution',
    'asked to remain anonymous',
    'the source said',
    'a source said',
    'sources said',
    'according to people familiar with',
    'an official close to',
    'a person briefed on the matter',
    'insisted on anonymity',
    'chose to remain anonymous'
);

// Replacement strings for str_replace(), index-aligned with $search. They are
// the search phrases themselves, so the substitution currently leaves the
// summary unchanged. Deriving the list from $search keeps the two in step and
// avoids a hand-copied duplicate picking up stray whitespace in a literal.
$replace = $search;

$feed = new Zend_Feed_Atom('http://www.google.com/reader/public/atom/user/00081205862505704902/label/anonymous?r=n&n=40');

foreach ($feed as $entry) {
    $title = $entry->title();
    $date_published = $entry->published();
    $date_updated = $entry->updated();

    // Start every entry from a clean slate so that a summary lacking a link,
    // an outlet name or quote text cannot inherit the previous entry's values.
    $link = null;
    $name = null;
    $text = '';

    // Without summary markup there is nothing to parse and therefore no quote
    // to store; this also keeps empty input away from loadHTML().
    $summary = $entry->summary();
    if (!is_string($summary) || $summary === '') {
        continue;
    }

    $dom_document = new DOMDocument();
    $dom_document->loadHTML($summary);
    $dom_xpath = new DOMXpath($dom_document);

    // Link to the news article: the last matching href that is not a
    // "news/more" link. DOMXPath::query() returns false (not null) on failure.
    $elements = $dom_xpath->query("//td[2]/font/div[2]/a/@href");
    if ($elements !== false) {
        foreach ($elements as $element) {
            foreach ($element->childNodes as $node) {
                if (strpos($node->nodeValue, 'news/more') === false) {
                    $link = $node->nodeValue;
                }
            }
        }
    }

    // Name of the publication. The "news/more" filter is kept from the
    // original code, which applied it here as well.
    $elements = $dom_xpath->query("//div[2]/font/b/font");
    if ($elements !== false) {
        foreach ($elements as $element) {
            foreach ($element->childNodes as $node) {
                if (strpos($node->nodeValue, 'news/more') === false) {
                    $name = $node->nodeValue;
                }
            }
        }
    }

    // Quote text: the concatenated child-node values of the matched <font>
    // element. If several elements match, the last one wins because $text is
    // restarted for each of them.
    $elements = $dom_xpath->query("//font[last()-2]");
    if ($elements !== false) {
        foreach ($elements as $element) {
            $text = '';
            foreach ($element->childNodes as $node) {
                $text .= $node->nodeValue;
            }
        }
    }

    // Only store examples with anonymous phrasing. One hit is enough, so stop
    // at the first matching phrase: the entry is then looked up and inserted
    // at most once, however many phrases it contains.
    $has_anonymous_phrase = false;
    foreach ($search as $phrase) {
        if (strpos($text, $phrase) !== false) {
            $has_anonymous_phrase = true;
            break;
        }
    }
    if (!$has_anonymous_phrase) {
        continue;
    }

    // Skip articles that are already stored under the same URL.
    $sql = "select * from anonymous where url = ?";
    $result = $db->fetchAll($sql, $link);
    if (count($result) > 0) {
        continue;
    }

    // Fetch the article body. The link comes from remote feed markup, so only
    // http(s) URLs are fetched; anything else would make file_get_contents()
    // read from the local filesystem. file_get_contents() reports failure by
    // returning false, in which case the empty default is kept. The try/catch
    // is retained for setups whose error handler turns warnings into
    // exceptions.
    $full_text = '';
    if ($link && preg_match('#^https?://#i', $link)) {
        try {
            $fetched = file_get_contents($link);
            if ($fetched !== false) {
                $full_text = $fetched;
            }
        } catch (Exception $e) {
            echo 'Caught exception: ', $e->getMessage(), "\n";
        }
    }

    $data = array(
        'title' => $title,
        'summary' => utf8_decode(str_replace($search, $replace, $text)),
        'url' => $link,
        'outlet' => $name,
        'date_published' => $date_published,
        'date_updated' => $date_updated,
        'full_text' => $full_text
    );

    // A failed insert is reported and the run carries on with the next entry.
    try {
        $db->insert('anonymous', $data);
    } catch (Exception $e) {
        echo 'Caught exception: ', $e->getMessage(), "\n";
    }
}

echo "Finished\n";