blog.Ring.idv.tw

Web Scraping for Yahoo!生活+

Web Scraping for Yahoo!生活+


在自然語言處理的領域中,有愈來愈多直接將網路資料當做語料庫(Corpus)來使用的趨勢~ 而要達成這樣的應用,當然需要有一個類似Web Crawler的機器人程式~ 來幫我們進行前置的搜集作業~ 而本文就是要做一個這樣的應用~ 這裡以「Yahoo!生活+」當做例子~

底下程式您可以自由地更改及使用~ 如果您有為它加上些許功能,也歡迎您貢獻您的成果 ^^

WebScraping.php

<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */
/**
 * Minimal contract shared by the page scrapers in this article.
 *
 * An implementation fetches one remote page in doQuery() and exposes the
 * text it kept through getBody().
 */
interface WebScraping
{
	/**
	 * Run the query the object has been configured for.
	 *
	 * The implementations shown in this article (YahooLife, YahooStore)
	 * return true after issuing the request and false when nothing has
	 * been configured to query yet.
	 */
	public function doQuery();
	/**
	 * Return the page text kept by the most recent doQuery() call.
	 */
	public function getBody();
}
?>

YahooLife.php

<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */
require_once "HttpClient.php";
require_once "WebScraping.php";

/**
 * Scraper for the store search results page of "Yahoo!生活+".
 *
 * Usage: setKeyword(), then doQuery(), then any of the get*() extractors.
 * Every extractor works on the page kept by the last successful doQuery()
 * and returns a list with one entry per match (empty when nothing matched).
 */
class YahooLife implements WebScraping
{
	/** @var string URL-encoded search keyword; "" until setKeyword() is called. */
	private $param = "";
	/** @var string Last fetched result page with <em> tags removed; "" if none. */
	private $body = "";

	public function __construct(){}

	/**
	 * Set the search keyword. It is URL-encoded here so that doQuery() can
	 * append it to the query string directly.
	 *
	 * @param string $keyword raw (not yet encoded) search keyword
	 */
	public function setKeyword($keyword)
	{
		$this->param = urlencode($keyword);
	}

	/**
	 * Fetch the search result page for the current keyword.
	 *
	 * @return bool true when a page was fetched and stored; false when no
	 *              keyword has been set or the fetch did not yield a string
	 */
	public function doQuery()
	{
		if ($this->param === "") {
			return false;
		}

		$html = HttpClient::quickGet('http://twsearch.lifestyle.yahoo.com/search?cate=store&type=biz&p=' . $this->param);

		// HttpClient is not shown here; presumably quickGet() returns a
		// non-string (e.g. false) on failure - confirm against HttpClient.php.
		// Reporting success with an empty page would hide the failure.
		if (!is_string($html)) {
			$this->body = "";
			return false;
		}

		// Strip the <em> wrappers so they do not end up inside captured text.
		$this->body = str_replace(array("<em>", "</em>"), '', $html);
		return true;
	}

	/**
	 * @return string the stored page text ("" before a successful doQuery())
	 */
	public function getBody()
	{
		return $this->body;
	}

	/**
	 * @return array contents of every <address>...</address> element in the page
	 */
	public function getStoreAddress()
	{
		return $this->captureAll('#<address>(.*)</address>#Us');
	}

	/**
	 * @return array for every star-icon image, the part of its file name
	 *               between "icon_star0" and ".gif" (presumably the rating
	 *               digit - verify against the live markup)
	 */
	public function getSatisfaction()
	{
		// Dots are escaped so they match a literal "." only.
		return $this->captureAll('#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us');
	}

	/**
	 * @return array the bizid value taken from every URL-encoded redirect
	 *               link pointing at tw.lifestyle.yahoo.com/biz.html
	 */
	public function getStoreID()
	{
		return $this->captureAll('#http://tw\.wrs\.yahoo\.com/\*\*http%3A%2F%2Ftw\.lifestyle\.yahoo\.com%2Fbiz\.html%3Fbizid%3D(.*)">#Us');
	}

	/**
	 * Run $regex over the stored page and return what its first capture
	 * group matched, one entry per match.
	 *
	 * @param string $regex PCRE pattern with at least one capture group
	 * @return array list of captured strings; empty on no match or PCRE error
	 */
	private function captureAll($regex)
	{
		// preg_match_all() yields 0 for "no match" and false on error; in
		// both cases there is nothing to return.
		if (!preg_match_all($regex, $this->body, $match)) {
			return array();
		}
		return $match[1];
	}
}
?>

YahooStore.php

<?php
/**
 * Date: 2008/04/26
 * Shen(http://blog.ring.idv.tw)
 */
require_once "HttpClient.php";
require_once "WebScraping.php";

/**
 * Scraper for the comment page of a single "Yahoo!生活+" store.
 *
 * Usage: setStoreID(), then doQuery(), then any of the get*() extractors.
 * Every extractor works on the page kept by the last successful doQuery()
 * and returns a list with one entry per match (empty when nothing matched).
 */
class YahooStore implements WebScraping
{
	/** @var string store ID appended to the comment-page URL; "" if unset. */
	private $storeId = "";
	/** @var string Last fetched comment page; "" if none. */
	private $body = "";

	public function __construct(){}

	/**
	 * Set the ID of the store whose comment page should be fetched.
	 *
	 * The ID is appended to the URL as given (no encoding is applied here).
	 *
	 * @param string $id store ID, e.g. one returned by YahooLife::getStoreID()
	 */
	public function setStoreID($id)
	{
		$this->storeId = $id;
	}

	/**
	 * Fetch the comment page of the current store.
	 *
	 * @return bool true when a page was fetched and stored; false when no
	 *              store ID has been set or the fetch did not yield a string
	 */
	public function doQuery()
	{
		// Compare as a string: a loose != "" check treats integer 0
		// differently depending on the PHP version.
		$id = (string) $this->storeId;
		if ($id === "") {
			return false;
		}

		$html = HttpClient::quickGet("http://tw.lifestyle.yahoo.com/biz_comment.html?bizid=" . $id . "&psm=");

		// HttpClient is not shown here; presumably quickGet() returns a
		// non-string (e.g. false) on failure - confirm against HttpClient.php.
		// Reporting success with an empty page would hide the failure.
		if (!is_string($html)) {
			$this->body = "";
			return false;
		}

		$this->body = $html;
		return true;
	}

	/**
	 * @return string the stored page text ("" before a successful doQuery())
	 */
	public function getBody()
	{
		return $this->body;
	}

	/**
	 * @return array contents of every <blockquote>...</blockquote> element
	 */
	public function getStoreComments()
	{
		return $this->captureAll('#<blockquote>(.*)</blockquote>#Us');
	}

	/**
	 * @return array for every star-icon image, the part of its file name
	 *               between "icon_star0" and ".gif" (presumably the rating
	 *               digit - verify against the live markup)
	 */
	public function getSatisfaction()
	{
		// Dots are escaped so they match a literal "." only.
		return $this->captureAll('#<img src="http://tw\.yimg\.com/i/tw/lifestyle/icon_star0(.*)\.gif#Us');
	}

	/**
	 * Run $regex over the stored page and return what its first capture
	 * group matched, one entry per match.
	 *
	 * @param string $regex PCRE pattern with at least one capture group
	 * @return array list of captured strings; empty on no match or PCRE error
	 */
	private function captureAll($regex)
	{
		// preg_match_all() yields 0 for "no match" and false on error; in
		// both cases there is nothing to return.
		if (!preg_match_all($regex, $this->body, $match)) {
			return array();
		}
		return $match[1];
	}
}
?>

測試範例

<?php
// Demo: search "Yahoo!生活+" for a keyword, list what was scraped from the
// result page, then fetch and list the comment page of one of the stores.
// The full open tag is used because the short form "<?" only works when
// short_open_tag is enabled.
require_once "YahooLife.php";
require_once "YahooStore.php";

// Step 1: keyword search and the three lists extracted from the result page.
$Ylife = new YahooLife();
$Ylife->setKeyword("美食");
$Ylife->doQuery();
$address = $Ylife->getStoreAddress();
echo "<h2>商家地址</h2>";
show($address);

$satisfaction = $Ylife->getSatisfaction();
echo "<h2>商家滿意度</h2>";
show($satisfaction);

$storeid = $Ylife->getStoreID();
echo "<h2>商家ID</h2>";
show($storeid);

// sleep for 2 seconds
sleep(2);

// Step 2: comment page of one store. Index 1 (the second ID found) is kept
// from the original listing - NOTE(review): confirm the first result was not
// intended. When fewer than two IDs were found an empty ID is passed, so
// doQuery() declines to fetch and the lists below are simply empty instead
// of raising an undefined-offset notice.
$Ystore = new YahooStore();
$Ystore->setStoreID(isset($storeid[1]) ? $storeid[1] : "");
$Ystore->doQuery();

$comments = $Ystore->getStoreComments();
echo "<h2>商家評價意見</h2>";
show($comments);

$satisfaction = $Ystore->getSatisfaction();
echo "<h2>商家評價滿意度</h2>";
show($satisfaction);

/**
 * Print every element of a list, each followed by an HTML line break.
 *
 * Elements are written as-is: no HTML escaping is applied to them.
 *
 * @param array $value list of printable values
 */
function show($value)
{
	foreach ($value as $entry) {
		echo $entry, "<br/>";
	}
}
?>

取得原始碼

2008-04-26 21:11:47

Leave a Comment

Copyright (C) Ching-Shen Chen. All rights reserved.

::: 搜尋 :::

::: 分類 :::

::: Ads :::

::: 最新文章 :::

::: 最新回應 :::

::: 訂閱 :::

Atom feed
Atom Comment