<?php
 
include 'lib/phpQuery-onefile.php';
 
/**
 
 * 
 
 * This class allows you to get data from any site.
 
 * The data are taken from defined locations in the DOM structure.
 
 * Data points are defined using phpQuery notation, similar to the selectors used in the jQuery library.
 
 * This class can fetch data in three different modes by:
 
 *  - scanning a single page
 
 *  - scanning a "from->to" range of pages matching defined URL schema
 
 *  - scanning a list of URLs retrieved from a PHP array  
 
 * 
 
 * @example scrape a single page
 
 * $scrap = new Scraper();
 
 * $scrap->setBaseUrl('http://page.to.scrap/index.html');
 
 * $scrap->addDataTarget('title', '#product h1');
 
 * $data = $scrap->process();
 
 * 
 
 * @example scrape a range of pages
 
 * $scrap = new Scraper();
 
 * $scrap->setBaseUrl('http://example.url.com/details.html?id=##TOKEN##');
 
 * $scrap->addRangeScanRule(151598039, 151598042, '##TOKEN##');
 
 * $scrap->addDataTarget('name', '#head1 .title h1');
 
 * $data = $scrap->process();
 
 *
 
 * @example scrape a list of custom URLs
 
 * $scrap = new Scraper();
 
 * $myUrls = array('http://site.ccm/url1/', 'http://site.ccm/url2/', 'http://site.ccm/url3/');
 
 * $scrap->addListScanRule($myUrls);
 
 * $scrap->addDataTarget('title', '#content .ogloszenie_item h1');
 
 * $scrap->addDataTarget('image', '#content .ogloszenie_thumb a');
 
 * $scrap->addDataTarget('price', '#content .ogloszenie_item:contains(\'Cena:\')');
 
 * $data = $scrap->process();
 
 * 
 
 * Downloaded data is returned as an array.
 
 * You can do whatever you want with the data;) 
 
 *
 
 * @package Scraper
 
 * @see This class uses the phpQuery library
 
 * @link http://code.google.com/p/phpquery/
 
 * 
 
 * @author JLukasiewicz jlukasie at gmail
 
 *
 
 */
 
class Scraper
{
    /**
     * Base URL. Scanned as-is in single-page mode; in range mode it is the
     * template in which the range token is replaced by each number.
     *
     * @var string
     */
    private $baseUrl = '';

    /**
     * Scan rules keyed by rule type ('range' or 'list').
     *
     * 'range' holds array('min' => ..., 'max' => ..., 'token' => ...),
     * 'list' holds a plain array of URLs. When both are present the range
     * rule takes precedence (see getUrlsToScan()).
     *
     * @var array
     */
    private $scanRule = array();

    /**
     * Data targets: result key => phpQuery selector.
     *
     * @var array
     */
    private $dataTargets = array();

    /**
     * The scraper needs no construction-time arguments; configure it through
     * setBaseUrl(), addRangeScanRule(), addListScanRule() and addDataTarget().
     */
    public function __construct()
    {
    }

    /**
     * Set the base URL.
     *
     * @param string $url page address, or URL template containing the range token
     * @throws Exception when $url is empty
     */
    public function setBaseUrl($url)
    {
        if (empty($url))
        {
            throw new Exception('Value not specified: url', 1);
        }

        $this->baseUrl = $url;
    }

    /**
     * Store a scan rule under its type, replacing any previous rule of that type.
     *
     * @param string $type  rule type ('range' or 'list')
     * @param mixed  $value rule definition
     * @throws Exception when $type or $value is empty
     */
    private function setScanRule($type, $value)
    {
        if (empty($type) || empty($value))
        {
            throw new Exception('Value not specified: type or value', 1);
        }

        $this->scanRule[$type] = $value;
    }

    /**
     * Register a range rule: every integer from $min to $max (inclusive) is
     * substituted for $token in the base URL.
     *
     * @param int    $min   first number of the range (0 is allowed)
     * @param int    $max   last number of the range
     * @param string $token placeholder in the base URL to be replaced
     * @throws Exception when $min or $max is not numeric, or $token is missing
     */
    public function addRangeScanRule($min, $max, $token)
    {
        // is_numeric() rather than empty(): empty(0) is true, which used to
        // make ranges starting (or ending) at 0 impossible.
        if (!is_numeric($min) || !is_numeric($max))
        {
            throw new Exception('Value not specified: min or max', 1);
        }

        // An empty token would leave the base URL untouched, so every URL of
        // the range would be the very same page.
        if (!is_string($token) || $token === '')
        {
            throw new Exception('Value not specified: token', 1);
        }

        $this->setScanRule('range', array('min' => (int) $min, 'max' => (int) $max, 'token' => $token));
    }

    /**
     * Register a list rule: the given URLs are scanned one by one.
     *
     * @param array $list URLs to scan
     * @throws Exception when $list is empty or not an array
     */
    public function addListScanRule($list)
    {
        if (empty($list) || !is_array($list))
        {
            throw new Exception('address list is not specified', 1);
        }

        $this->setScanRule('list', $list);
    }

    /**
     * Register a data target. A later target with the same name overwrites
     * the earlier one.
     *
     * @param string $name     key under which the value appears in the result rows
     * @param string $selector phpQuery selector locating the value in the page
     * @throws Exception when $name or $selector is empty
     */
    public function addDataTarget($name, $selector)
    {
        if (empty($name) || empty($selector))
        {
            throw new Exception('Value not specified: name or selector', 1);
        }

        $this->dataTargets[$name] = $selector;
    }

    /**
     * Perform the scan.
     *
     * Each URL is downloaded once and parsed with phpQuery; for every data
     * target the inner HTML of the matched element(s) is collected. URLs that
     * are empty or cannot be downloaded are skipped silently (best effort).
     *
     * @return array list of rows, one per successfully downloaded page, each
     *               row being an array of target name => extracted HTML
     * @throws Exception when the configured scan rule cannot be turned into URLs
     */
    public function process()
    {
        $data = array();

        // Build the URL list first so configuration errors are still reported
        // even when there is nothing to extract.
        $urls = $this->getUrlsToScan();

        // Without data targets every row would be empty and dropped anyway,
        // so there is no point in downloading anything.
        if (empty($this->dataTargets))
        {
            return $data;
        }

        foreach ($urls as $url)
        {
            // file_get_contents('') throws ValueError on PHP 8, and the @
            // operator does not suppress exceptions, so filter such URLs here.
            if (!is_string($url) || $url === '')
            {
                continue;
            }

            // Download failures are deliberately ignored: one dead page must
            // not abort the whole scan.
            $input = @file_get_contents($url);
            if ($input === false || $input === '')
            {
                continue;
            }

            // Parse the markup that was just downloaded rather than letting
            // phpQuery fetch the same URL a second time.
            $document = phpQuery::newDocumentHTML($input);

            $scrap = array();
            foreach ($this->dataTargets as $name => $selector)
            {
                $scrap[$name] = $document->find($selector)->html();
            }

            // phpQuery keeps every created document in a static registry;
            // release it so long scans do not accumulate parsed pages.
            phpQuery::unloadDocuments($document->getDocumentID());

            $data[] = $scrap;
        }

        return $data;
    }

    /**
     * Build the list of URLs to scan from the current configuration.
     *
     * - no scan rule: the base URL alone (single-page mode)
     * - 'range' rule: the base URL with the token replaced by each number
     * - 'list' rule:  the stored list (only used when no range rule is set)
     *
     * @return array URLs to scan
     * @throws Exception code 2 when a range rule is set without a base URL,
     *                   code 3 when the stored range rule is malformed
     */
    private function getUrlsToScan()
    {
        if (empty($this->scanRule))
        {
            return array($this->baseUrl);
        }

        if (!empty($this->scanRule['range']))
        {
            if (empty($this->baseUrl))
            {
                throw new Exception('baseUrl not specified', 2);
            }

            $range = $this->scanRule['range'];

            // isset() instead of !empty() so that a boundary of 0 is valid.
            if (!isset($range['min'], $range['max'], $range['token']))
            {
                throw new Exception('scanRule invalid format', 3);
            }

            $urls = array();
            for ($i = $range['min']; $i <= $range['max']; $i++)
            {
                $urls[] = str_replace($range['token'], (string) $i, $this->baseUrl);
            }

            return $urls;
        }

        if (!empty($this->scanRule['list']))
        {
            return $this->scanRule['list'];
        }

        return array();
    }
}
 
 |