PHP пустой массив при запуске скрипта
Кто-нибудь знает, почему это больше не работает? https://andreyvoev.com/programming/simple-serp-tracker-php-class/
Он просто отобразит Array () ================
это не мой код, он принадлежит действительно умному парню. Кажется, что я не могу заставить это работать однако. Возвращает пустой массив. Я попытался запустить парсер на своем собственном сайте, и он все еще не работает, так что это не потому, что Google блокирует скрипт.
<?php
/**
* Simple SERP Tracker class
*
* http://www.andreyvoev.com/simple-serp-tracker-php-class
*
* @copyright Andrey Voev 2011
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @author Andrey Voev <andreyvoev@gmail.com>
* @version 1.0
*
*/
abstract class Tracker
{
// the url that we will use as a base for our search
protected $baseurl;
// the site that we are searching for
protected $site;
// the keywords for the search
protected $keywords;
// the current page the crawler is on
protected $current;
// starting time of the search
protected $time_start;
// debug info array
protected $debug;
// the limit of the search results
protected $limit;
// proxy file value
protected $proxy;
public $found;
/**
* Constructor function for all new tracker instances.
*
* @param Array $keywords
* @param String $site
* @param Int $limit OPTIONAL: number of results to search
* @return tracker
*/
function __construct(array $keywords, $site, $limit = 100)
{
// the keywords we are searching for
$this->keywords = $keywords;
// the url of the site we are checking the position of
$this->site = $site;
// set the maximum results we will search trough
$this->limit = $limit;
// setup the array for the results
$this->found = array();
// starting position
$this->current = 0;
// start benchmarking
$this->time_start = microtime(true);
// set the time limit of the script execution - default is 6 min.
set_time_limit(360);
// check if all the required parameters are set
$this->initial_check();
}
/**
* Initial check if the base url is a string and if it has the required "keyword" and "position" keywords.
*/
protected function initial_check()
{
// get the model url from the extension class
$url = $this->set_baseurl();
// check if the url is a string
if(!is_string($url)) die("The url must be a string");
// check if the url has the keyword and parameter in it
$k = strpos($url, 'keyword');
$p = strpos($url, 'position');
if ($k === FALSE || $p === FALSE) die("Missing keyword or position parameter in URL");
}
/**
* Set up the proxy if used
*
* @param String $file OPTIONAL: if filename is not provided, the proxy will be turned off.
*/
public function use_proxy($file = FALSE)
{
// the name of the proxy txt file if any
$this->proxy = $file;
if($this->proxy != FALSE)
{
if(file_exists($this->proxy))
{
// get a proxy from a supplied file
$proxies = file($this->proxy);
// select a random proxy from the list
$this->proxy = $proxies[array_rand($proxies)];
}
else
{
die("The proxy file doesn't exist");
}
}
}
/**
* Parse the result from the crawler and pass the result html to the find function.
*
* @param String $single_url OPTIONAL: override the default url
* @return String $result;
*/
protected function parse(array $single_url = NULL)
{
// array of curl handles
$curl_handles = array();
// data to be returned
$result = array();
// multi handle
$mh = curl_multi_init();
// check if another URL is supplied
$urls = ($single_url == NULL) ? $this->baseurl : $single_url;
// loop through $data and create curl handles and add them to the multi-handle
foreach ($urls as $id => $d)
{
$curl_handles[$id] = curl_init();
$url = (is_array($d) && !empty($d['url'])) ? $d['url'] : $d;
curl_setopt($curl_handles[$id], CURLOPT_URL, $url);
curl_setopt($curl_handles[$id], CURLOPT_HEADER, 0);
curl_setopt($curl_handles[$id], CURLOPT_RETURNTRANSFER, 1);
if($this->proxy != FALSE)
{
// use the selected proxy
curl_setopt($curl_handles[$id], CURLOPT_HTTPPROXYTUNNEL, 0);
curl_setopt($curl_handles[$id], CURLOPT_PROXY, $this->proxy);
}
// is it post?
if (is_array($d))
{
if (!empty($d['post']))
{
curl_setopt($curl_handles[$id], CURLOPT_POST, 1);
curl_setopt($curl_handles[$id], CURLOPT_POSTFIELDS, $d['post']);
}
}
// are there any extra options?
if (!empty($options))
{
curl_setopt_array($curl_handles[$id], $options);
}
curl_multi_add_handle($mh, $curl_handles[$id]);
}
// execute the handles
$running = null;
do
{
curl_multi_exec($mh, $running);
}
while($running > 0);
// get content and remove handles
foreach($curl_handles as $id => $c)
{
$result[$id] = curl_multi_getcontent($c);
curl_multi_remove_handle($mh, $c);
}
// close curl
curl_multi_close($mh);
// return the resulting html
return $result;
}
/**
* Crawl trough every page and pass the result to the find function until all the keywords are processed.
*/
protected function crawl()
{
$this->setup();
$html = $this->parse();
$i = 0;
foreach($html as $single)
{
$result = $this->find($single);
if($result !== FALSE)
{
if(!isset($this->found[$this->keywords[$i]]))
{
$this->found[$this->keywords[$i]] = $this->current + $result;
// save the time it took to find the result with this keyword
$this->debug['time'][$this->keywords[$i]] = number_format(microtime(true) - $this->time_start, 3);
unset($this->keywords[$i]);
}
// remove the keyword from the haystack
unset($this->keywords[$i]);
}
$i++;
}
if(!empty($this->keywords))
{
if($this->current <= $this->limit)
{
$this->current += 10;
$this->crawl();
}
}
}
/**
* Prepare the array of the keywords for every run.
*/
protected function setup()
{
// prepare the url array for the new loop
unset($this->baseurl);
foreach($this->keywords as $keyword)
{
$url = $this->set_baseurl();
$url = str_replace("keyword", $keyword, $url);
$url = str_replace("position", $this->current, $url);
$this->baseurl[] = $url;
}
}
/**
* Start the crawl/search process.
*/
function run()
{
$this->crawl();
}
/**
* Return the results from the search.
*
* @return Array $this->found
*/
function get_results()
{
return $this->found;
}
/**
* Return the debug information - time taken, etc.
*
* @return Array $this->debug
*/
function get_debug_info()
{
return $this->debug;
}
/**
* Set up the base url for the specific search engine using "keyword" and "position" for setting up the template.
*
* @return String $baseurl;
*/
abstract function set_baseurl();
/**
* Find the occurrence of the site in the results page. Specific for every search engine.
*
* @param String $html OPTIONAL: override the default html if needed
* @return String $baseurl;
*/
abstract function find($html);
} ?>
включайте "extra.php";
<?php class GoogleTracker extends Tracker
{
function set_baseurl()
{
// use "keyword" and "position" to mark the position of the variables in the url
$baseurl = "http://www.google.com/search?q=keyword&start=position";
return $baseurl;
}
function find($html)
{
// process the html and return either a numeric value of the position of the site in the current page or FALSE
$dom = new DOMDocument();
@$dom->loadHTML($html);
$nodes = $dom->getElementsByTagName('cite');
// found is false by default, we will set it to the position of the site in the results if found
$found = FALSE;
// start counting the results from the first result in the page
$current = 1;
foreach($nodes as $node)
{
$node = $node->nodeValue;
// look for links that look like this: cmsreport.com › Blogs › Bryan's blog
if(preg_match('/\s/',$node))
{
$site = explode(' ',$node);
}
else
{
$site = explode('/',$node);
}
$urls[$current] = $site[0];
if($site[0] == $this->site)
{
$found = TRUE;
$place = $current;
}
$current++;
}
if(isset($found) && $found !== FALSE)
{
return $place;
}
else
{
return FALSE;
}
}
} ?>
а потом я использую
<?php
include 'main.php';
//which is the entire code
$test = new GoogleTracker(array('git'), 'git-scm.com', 50);
//$test->use_proxy('proxy.txt');
$test->run();
print_r($test->get_results());
echo "================<br>";
print_r($test->get_debug_info());
?>