在自然語言處理的領域中,直接將網路資料當作語料庫(Corpus)來使用的趨勢愈來愈明顯。要達成這樣的應用,當然需要一個類似 Web Crawler 的機器人程式,來幫我們進行前置的蒐集作業。本文就是要實作一個這樣的應用,這裡以「Yahoo!生活+」當作例子。
底下的程式您可以自由地更改及使用。如果您為它加上了一些功能,也歡迎您貢獻您的成果 ^^
WebScraping.php
<?php
/**
* Date: 2008/04/26
* Shen(http://blog.ring.idv.tw)
*/
/**
 * Common contract for the page scrapers in this example.
 *
 * A scraper is first configured by the implementing class (keyword, store
 * ID, ...), then doQuery() retrieves the page and getBody() exposes what
 * was retrieved.
 */
interface WebScraping
{
/**
 * Run the configured query and keep the retrieved page for later extraction.
 *
 * The implementations in this example return true after issuing the
 * request and false when no query parameter has been set.
 */
public function doQuery();
/**
 * Return the page content stored by the last doQuery() call.
 */
public function getBody();
}
?>
YahooLife.php
<?php
/**
* Date: 2008/04/26
* Shen(http://blog.ring.idv.tw)
*/
require_once "HttpClient.php";
require_once "WebScraping.php";
/**
 * Scraper for the "Yahoo! Life+" store search result page.
 *
 * Usage: setKeyword(), then doQuery(), then any of the get*() extractors.
 * The extractors run regular expressions over the page fetched by the last
 * successful doQuery() and each return a list of captured strings.
 */
class YahooLife implements WebScraping
{
    /** @var string URL-encoded search keyword; empty until setKeyword() is called. */
    private $param = "";

    /** @var string Fetched result page with <em> tags removed; empty when nothing was fetched. */
    private $body = "";

    public function __construct()
    {
    }

    /**
     * Set the search keyword.
     *
     * @param string $keyword Raw (not yet URL-encoded) keyword.
     */
    public function setKeyword($keyword)
    {
        $this->param = urlencode($keyword);
    }

    /**
     * Fetch the search result page for the current keyword.
     *
     * @return bool true when a page was fetched and stored; false when no
     *              keyword is set or the HTTP client did not return a string.
     */
    public function doQuery()
    {
        if ($this->param === "") {
            return false;
        }

        $html = HttpClient::quickGet('http://twsearch.lifestyle.yahoo.com/search?cate=store&type=biz&p=' . $this->param);

        // A non-string result means the request failed; do not pretend the
        // query succeeded and do not keep a stale page around.
        if (!is_string($html)) {
            $this->body = "";
            return false;
        }

        // Drop <em> markup (presumably keyword highlighting - TODO confirm)
        // so it cannot end up inside the captured values.
        $this->body = str_replace(array("<em>", "</em>"), '', $html);
        return true;
    }

    /**
     * @return string The stored page content ("" when nothing was fetched).
     */
    public function getBody()
    {
        return $this->body;
    }

    /**
     * @return array Contents of every <address> element on the page.
     */
    public function getStoreAddress()
    {
        return $this->captureAll('#<address>(.*)</address>#Us');
    }

    /**
     * @return array The text between "icon_star0" and ".gif" for every
     *               star-rating image on the page.
     */
    public function getSatisfaction()
    {
        // Dots are escaped so they only match literal dots in the URL.
        return $this->captureAll('#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us');
    }

    /**
     * @return array The bizid value of every store link on the page.
     */
    public function getStoreID()
    {
        // The store link is wrapped in a redirect URL, so the target URL
        // appears percent-encoded in the page source.
        return $this->captureAll('#http://tw\.wrs\.yahoo\.com/\*\*http%3A%2F%2Ftw\.lifestyle\.yahoo\.com%2Fbiz\.html%3Fbizid%3D(.*)">#Us');
    }

    /**
     * Collect the first capture group of every match of $regex in the page.
     *
     * @param string $regex PCRE pattern with one capture group.
     * @return array Captured strings; empty on no match or on a PCRE error.
     */
    private function captureAll($regex)
    {
        // preg_match_all() yields 0 for "no match" and false on error;
        // either way there is nothing to return.
        if (!preg_match_all($regex, $this->body, $match)) {
            return array();
        }
        return $match[1];
    }
}
?>
YahooStore.php
<?php
/**
* Date: 2008/04/26
* Shen(http://blog.ring.idv.tw)
*/
require_once "HttpClient.php";
require_once "WebScraping.php";
/**
 * Scraper for the comment page of a single "Yahoo! Life+" store.
 *
 * Usage: setStoreID(), then doQuery(), then any of the get*() extractors.
 * The extractors run regular expressions over the page fetched by the last
 * successful doQuery() and each return a list of captured strings.
 */
class YahooStore implements WebScraping
{
    /** @var string Store ID placed in the bizid query parameter; empty until setStoreID() is called. */
    private $storeId = "";

    /** @var string Fetched comment page; empty when nothing was fetched. */
    private $body = "";

    public function __construct()
    {
    }

    /**
     * Set the store whose comment page should be fetched.
     *
     * @param string $id Store ID, used verbatim in the request URL.
     */
    public function setStoreID($id)
    {
        $this->storeId = $id;
    }

    /**
     * Fetch the comment page for the current store ID.
     *
     * @return bool true when a page was fetched and stored; false when no
     *              store ID is set or the HTTP client did not return a string.
     */
    public function doQuery()
    {
        // Compare as a string so null and "" are both treated as "not set"
        // regardless of PHP version.
        if ((string) $this->storeId === "") {
            return false;
        }

        $html = HttpClient::quickGet("http://tw.lifestyle.yahoo.com/biz_comment.html?bizid=" . $this->storeId . "&psm=");

        // A non-string result means the request failed; do not pretend the
        // query succeeded and do not keep a stale page around.
        if (!is_string($html)) {
            $this->body = "";
            return false;
        }

        $this->body = $html;
        return true;
    }

    /**
     * @return string The stored page content ("" when nothing was fetched).
     */
    public function getBody()
    {
        return $this->body;
    }

    /**
     * @return array Contents of every <blockquote> element on the page.
     */
    public function getStoreComments()
    {
        return $this->captureAll('#<blockquote>(.*)</blockquote>#Us');
    }

    /**
     * @return array The text between "icon_star0" and ".gif" for every
     *               star-rating image on the page.
     */
    public function getSatisfaction()
    {
        // Dots are escaped so they only match literal dots in the URL.
        return $this->captureAll('#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us');
    }

    /**
     * Collect the first capture group of every match of $regex in the page.
     *
     * @param string $regex PCRE pattern with one capture group.
     * @return array Captured strings; empty on no match or on a PCRE error.
     */
    private function captureAll($regex)
    {
        // preg_match_all() yields 0 for "no match" and false on error;
        // either way there is nothing to return.
        if (!preg_match_all($regex, $this->body, $match)) {
            return array();
        }
        return $match[1];
    }
}
?>
測試範例
<?php
// Demo: search stores by keyword, then fetch the comments of one result.
// Uses the full "<?php" open tag so the script also runs when
// short_open_tag is disabled.
require_once "YahooLife.php";
require_once "YahooStore.php";

// Step 1: keyword search. When the query fails the extractors below simply
// return empty lists, so only the headings are printed.
$Ylife = new YahooLife();
$Ylife->setKeyword("美食");
$Ylife->doQuery();

$address = $Ylife->getStoreAddress();
echo "<h2>商家地址</h2>";
show($address);

$satisfaction = $Ylife->getSatisfaction();
echo "<h2>商家滿意度</h2>";
show($satisfaction);

$storeid = $Ylife->getStoreID();
echo "<h2>商家ID</h2>";
show($storeid);

// Step 2: pick a store. The example looks at the second hit; fall back to
// the first one when the search returned a single result, and to no store
// at all when it returned nothing.
if (isset($storeid[1])) {
    $targetId = $storeid[1];
} elseif (isset($storeid[0])) {
    $targetId = $storeid[0];
} else {
    $targetId = "";
}

$Ystore = new YahooStore();
if ($targetId !== "") {
    // sleep for 2 seconds between the two requests
    sleep(2);
    $Ystore->setStoreID($targetId);
    $Ystore->doQuery();
}

// NOTE(review): show() echoes scraped markup unescaped; escape it before
// using this on a page served to other users.
$comments = $Ystore->getStoreComments();
echo "<h2>商家評價意見</h2>";
show($comments);

$satisfaction = $Ystore->getSatisfaction();
echo "<h2>商家評價滿意度</h2>";
show($satisfaction);
/**
 * Print every element of a list, each followed by an HTML line break.
 *
 * @param array $value Values to print, in iteration order.
 */
function show($value)
{
    foreach ($value as $entry) {
        echo $entry, "<br/>";
    }
}
?>
