This snippet of PHP code demonstrates web scraping. It reads a sample page from Amazon.com, selects the HTML elements that match certain CSS class names, and writes the matched text to an RSS feed.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
<?php
/*
 * Scrapes Amazon's Kindle best-seller page and writes every listed title
 * whose price reads "Free" to an RSS 2.0 feed stored in amazon.rss (in the
 * current working directory).
 *
 * file_get_html() and the find() / plaintext / href API used below are not
 * PHP built-ins; they are expected to come from the PHP Simple HTML DOM
 * Parser library, which must be loaded before this script runs.
 */

if (!function_exists('file_get_html')) {
    error_log('file_get_html() is not defined; load the PHP Simple HTML DOM Parser library first.');
    exit(1);
}

// Normalises scraped text: decodes HTML entities that may still be present in
// the page markup, so they are not escaped a second time further down.
// NOTE(review): assumes the scraped page is UTF-8 encoded - confirm.
$plain = function ($text) {
    return trim(html_entity_decode((string) $text, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
};

// Escapes text for XML element content. ENT_XML1 limits the output to the
// five entities XML predefines; htmlentities() would emit HTML-only names
// such as &eacute; that an XML parser rejects.
$xml = function ($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
};

// Escapes text for the HTML fragment that is carried inside <description>.
$html = function ($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

// RSS 2.0 requires RFC 822 dates; DATE_RSS uses a numeric UTC offset instead
// of a timezone abbreviation that feed readers may not recognise.
$now = date(DATE_RSS);

// The declared encoding matches the UTF-8 escaping performed above.
$head  = '<?xml version="1.0" encoding="UTF-8"?>';
$head .= '<rss version="2.0">';
$head .= '<channel>';
$head .= '<title>Amazon </title>';
$head .= '<link>http://www.amazon.com</link>';
$head .= '<description>Amazon RSS Feed</description>';

$listUrl = "http://www.amazon.com/Best-Sellers-Kindle-Store/zgbs/digital-text/";
$page = file_get_html($listUrl);
if (!$page) {
    // Leave any existing amazon.rss untouched when the page cannot be loaded.
    error_log('Could not load ' . $listUrl);
    exit(1);
}

$rssfeed = '';

foreach ($page->find("div.zg_item_compact") as $entry) {
    // Keep only entries that carry a "Free" price label.
    $isFree = false;
    foreach ($entry->find('strong.price') as $price) {
        if (trim($price->plaintext) === "Free") {
            $isFree = true;
            break;
        }
    }
    if (!$isFree) {
        continue;
    }

    // Exactly one title link is used per item, so the item always has one
    // <title>, <link>, <guid> and <description>. Entries without a title
    // link, or whose link has no /dp/<ASIN> segment, are skipped.
    $book = $entry->find("div.zg_title a", 0);
    if (!$book || !preg_match('#/dp/([^/?]+)#', (string) $book->href, $matches)) {
        continue;
    }
    $ASIN = trim($matches[1]);
    $itemUrl = "http://www.amazon.com/dp/" . rawurlencode($ASIN) . "/?tag=publisherapi-20";

    // Build one HTML link per author.
    // NOTE(review): assumes author hrefs are site-relative paths without a
    // query string, as the original URL concatenation did - confirm.
    $authors = array();
    foreach ($entry->find("div.zg_byline a") as $author) {
        $bio = "http://www.amazon.com" . $plain($author->href) . "/?tag=publisherapi-20";
        $name = $plain($author->plaintext);
        $authors[] = '<a href="' . $html($bio) . '">' . $html($name) . '</a>';
    }
    $description = $authors ? 'By ' . implode(', ', $authors) : '';

    $rssfeed .= '<item>';
    $rssfeed .= '<title>' . $xml($plain($book->plaintext)) . '</title>';
    $rssfeed .= '<link>' . $xml($itemUrl) . '</link>';
    $rssfeed .= '<guid isPermaLink="true">' . $xml($itemUrl) . '</guid>';
    // The description is an HTML fragment, so it is XML-escaped as a whole
    // rather than embedded as raw child elements.
    $rssfeed .= '<description>' . $xml($description) . '</description>';
    $rssfeed .= '<pubDate>' . $now . '</pubDate>';
    $rssfeed .= '</item>';
}

$footer = '</channel></rss>';
$rssfeed = $head . $rssfeed . $footer;

// LOCK_EX keeps a concurrent reader from seeing a half-written feed.
if (file_put_contents("amazon.rss", $rssfeed, LOCK_EX) === false) {
    error_log('Could not write amazon.rss');
    exit(1);
}
?>