在自然語言處理的領域中,直接將網路資料當作語料庫(Corpus)來使用的趨勢愈來愈明顯。要達成這樣的應用,當然需要一個類似 Web Crawler 的機器人程式,來幫我們進行前置的蒐集作業。本文就是要實作這樣的應用,這裡以「Yahoo!生活+」當作例子。
您可以自由地修改及使用底下的程式。如果您為它加上了新功能,也歡迎您貢獻您的成果 ^^
WebScraping.php
<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */

/**
 * Common contract for the page scrapers in this article.
 *
 * An implementer fetches one remote page in doQuery() and exposes the
 * fetched markup through getBody().
 */
interface WebScraping
{
    /**
     * Performs the remote request for the currently configured query.
     *
     * The implementers shown alongside this interface (YahooLife, YahooStore)
     * return true after issuing the request and false when no query
     * parameter has been set.
     */
    public function doQuery();

    /**
     * Returns the page content stored by the last doQuery() call.
     */
    public function getBody();
}
?>
YahooLife.php
<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */
require_once "HttpClient.php";
require_once "WebScraping.php";

/**
 * Scraper for the Yahoo! Life+ (Taiwan) store search result page.
 *
 * Usage: setKeyword(), then doQuery(), then any of the get*() extractors,
 * which all run a regular expression over the page fetched by doQuery().
 */
class YahooLife implements WebScraping
{
    /** @var string URL-encoded search keyword; empty until setKeyword() is called. */
    private $param = "";

    /** @var string Markup of the last fetched result page with <em>/</em> tags stripped. */
    private $body = "";

    public function __construct()
    {
    }

    /**
     * Stores the search keyword in URL-encoded form.
     *
     * @param string $keyword Raw (unencoded) search keyword.
     */
    public function setKeyword($keyword)
    {
        $this->param = urlencode($keyword);
    }

    /**
     * Fetches the search result page for the current keyword.
     *
     * @return bool true when a page was fetched and stored; false when no
     *              keyword has been set or the HTTP helper did not return a
     *              string.
     */
    public function doQuery()
    {
        if ($this->param === "") {
            return false;
        }

        $page = HttpClient::quickGet(
            'http://twsearch.lifestyle.yahoo.com/search?cate=store&type=biz&p=' . $this->param
        );

        // HttpClient is defined outside this file; a non-string result is
        // assumed to signal a failed request (TODO confirm against HttpClient.php).
        // Previously such a result was silently turned into an empty body
        // while doQuery() still reported success.
        if (!is_string($page)) {
            $this->body = "";
            return false;
        }

        // Drop the <em> markup so it does not end up inside the captured
        // values.
        $this->body = str_replace(array("<em>", "</em>"), '', $page);

        return true;
    }

    /**
     * @return string Markup stored by the last doQuery() call ("" if none).
     */
    public function getBody()
    {
        return $this->body;
    }

    /**
     * @return array Contents of every <address>...</address> element in the page.
     */
    public function getStoreAddress()
    {
        return $this->captureAll('#<address>(.*)</address>#Us');
    }

    /**
     * @return array For every star icon image in the page, the text between
     *               "icon_star0" and ".gif" in its URL (presumably the
     *               rating value; verify against the live markup).
     */
    public function getSatisfaction()
    {
        // Dots are escaped so they only match a literal "." in the URL.
        return $this->captureAll(
            '#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us'
        );
    }

    /**
     * @return array For every redirect link to biz.html in the page, the
     *               text following "bizid%3D" up to the closing '">'.
     */
    public function getStoreID()
    {
        return $this->captureAll(
            '#http://tw\.wrs\.yahoo\.com/\*\*http%3A%2F%2Ftw\.lifestyle\.yahoo\.com%2Fbiz\.html%3Fbizid%3D(.*)">#Us'
        );
    }

    /**
     * Runs $regex over the stored page and returns all first-group captures.
     *
     * @param string $regex PCRE pattern with exactly one capturing group.
     * @return array Captured strings in document order; an empty array when
     *               nothing matches or the PCRE call fails.
     */
    private function captureAll($regex)
    {
        // preg_match_all() yields 0 for "no match" and false on error; both
        // map to an empty list so callers can always iterate the result.
        if (!preg_match_all($regex, $this->body, $match)) {
            return array();
        }

        return $match[1];
    }
}
?>
YahooStore.php
<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */
require_once "HttpClient.php";
require_once "WebScraping.php";

/**
 * Scraper for the Yahoo! Life+ (Taiwan) comment page of a single store.
 *
 * Usage: setStoreID(), then doQuery(), then getStoreComments() and/or
 * getSatisfaction(), which run a regular expression over the fetched page.
 */
class YahooStore implements WebScraping
{
    /** @var mixed Store ID exactly as passed to setStoreID(); "" until set. */
    private $storeId = "";

    /** @var string Markup of the last fetched comment page. */
    private $body = "";

    public function __construct()
    {
    }

    /**
     * Sets the store whose comment page doQuery() will fetch.
     *
     * @param string $id Value placed after "bizid=" in the request URL; it
     *                   is appended as-is, without URL-encoding.
     */
    public function setStoreID($id)
    {
        $this->storeId = $id;
    }

    /**
     * Fetches the comment page for the current store ID.
     *
     * @return bool true when a page was fetched and stored; false when no
     *              store ID has been set or the HTTP helper did not return a
     *              string.
     */
    public function doQuery()
    {
        // Compare the string form strictly: the loose `!= ""` test gives
        // different answers for the integer 0 on PHP 7 and PHP 8.
        if ((string) $this->storeId === "") {
            return false;
        }

        $page = HttpClient::quickGet(
            "http://tw.lifestyle.yahoo.com/biz_comment.html?bizid=" . $this->storeId . "&psm="
        );

        // HttpClient is defined outside this file; a non-string result is
        // assumed to signal a failed request (TODO confirm against HttpClient.php).
        // Keep the body a string and report the failure instead of success.
        if (!is_string($page)) {
            $this->body = "";
            return false;
        }

        $this->body = $page;

        return true;
    }

    /**
     * @return string Markup stored by the last doQuery() call ("" if none).
     */
    public function getBody()
    {
        return $this->body;
    }

    /**
     * @return array Contents of every <blockquote>...</blockquote> element in the page.
     */
    public function getStoreComments()
    {
        return $this->captureAll('#<blockquote>(.*)</blockquote>#Us');
    }

    /**
     * @return array For every star icon image in the page, the text between
     *               "icon_star0" and ".gif" in its URL (presumably the
     *               rating value; verify against the live markup).
     */
    public function getSatisfaction()
    {
        // Dots are escaped so they only match a literal "." in the URL.
        return $this->captureAll(
            '#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us'
        );
    }

    /**
     * Runs $regex over the stored page and returns all first-group captures.
     *
     * @param string $regex PCRE pattern with exactly one capturing group.
     * @return array Captured strings in document order; an empty array when
     *               nothing matches or the PCRE call fails.
     */
    private function captureAll($regex)
    {
        // preg_match_all() yields 0 for "no match" and false on error; both
        // map to an empty list so callers can always iterate the result.
        if (!preg_match_all($regex, $this->body, $match)) {
            return array();
        }

        return $match[1];
    }
}
?>
測試範例
<?php
// Demo script: searches Yahoo! Life+ for a keyword, prints what was scraped
// from the result page, then fetches and prints the comment page of one of
// the stores that were found.
// NOTE(review): scraped values are echoed without HTML escaping; run this
// only as a local test page.
require_once "YahooLife.php";
require_once "YahooStore.php";

/**
 * Prints each element of $value followed by an HTML line break.
 *
 * @param array $value Values to print; anything that is not an array is
 *                     ignored instead of triggering a foreach warning.
 */
function show($value)
{
    if (!is_array($value)) {
        return;
    }

    foreach ($value as $v) {
        echo $v . "<br/>";
    }
}

$Ylife = new YahooLife();
$Ylife->setKeyword("美食");
$Ylife->doQuery();

$address = $Ylife->getStoreAddress();
echo "<h2>商家地址</h2>";
show($address);

$satisfaction = $Ylife->getSatisfaction();
echo "<h2>商家滿意度</h2>";
show($satisfaction);

$storeid = $Ylife->getStoreID();
echo "<h2>商家ID</h2>";
show($storeid);

$Ystore = new YahooStore();

// The demo looks up the second store of the result list (index 1). Only
// issue the second request when that entry actually exists.
if (is_array($storeid) && isset($storeid[1])) {
    // sleep for 2 seconds
    sleep(2);

    $Ystore->setStoreID($storeid[1]);
    $Ystore->doQuery();
}

$comments = $Ystore->getStoreComments();
echo "<h2>商家評價意見</h2>";
show($comments);

$satisfaction = $Ystore->getSatisfaction();
echo "<h2>商家評價滿意度</h2>";
show($satisfaction);
?>