0から10 000 000までのすべてのページの情報を1つずつ取得するクローラーを作成したいと考えています。いくら時間がかかってもかまいません。私はそれが機能することを望んでいます。ここに私が得るエラーがあります
致命的なエラー: 関数の最大ネスト レベル '100' に達しました。中止します! D:\wamp\www\crawler\index.php の 25 行目
25行目は
$htmlstr = (string)$this->curlGet($url);
そして、私の完全なスクリプトがあります。
ご協力ありがとうございました !
header('Content-Type: text/html; charset=utf-8');
ini_set('max_input_nesting_level','100000');
ini_set('max_execution_time','100000');
class crawler{
private $url;
private $page;
private $bothurl;
private $innerDom = null;
public $prop;
public $entry;
function __construct($entry){
$this->entry = $entry;
$this->bothurl = array('http://www.remax-quebec.com/fr/inscription/Q/'.$entry.'.rmx','http://www.remax-quebec.com/en/inscription/Q/'.$entry.'.rmx');
$this->scan();
}
private function scan(){
$i =0;
foreach($this->bothurl as $url){
$this->url = $url;
$this->lang = ($i==0)?'fr':'en';
$htmlstr = (string)$this->curlGet($url);
$dom = new DOMDocument;
@$dom->loadHTML($htmlstr);
$this->page = $dom;
$this->htmlInfos();
$this->getInfos();
$i++;
}
}
private function htmlInfos(){
$divs = $this->page->getElementsByTagName('div');
foreach($divs as $div){
if($div->hasAttribute('class') && $div->getAttribute('class') == 'bloc specs'){
$innerDom = new DOMDocument();
@$innerDom->loadHTML($this->innerHTML($div));
$this->innerDom = $innerDom;
}
}
if($this->innerDom === null) $this->changeEntry();
}
private function getInfos(){
$sect = 0;
foreach($this->innerDom->getElementsByTagName('div') as $div){
# obtenir la description
$this->getDesc($div->getAttribute('class'),$div);
# obtenir les caractéristiques
$this->getCaract($div->getAttribute('class'),$div);
# obtenir les informations interieur, exterieur et evaluation
if($div->getAttribute('class') == 'section deux-colonnes'){
switch($sect){
case 0: $this->getSpecInfos($div,'interieur'); break;
case 1: $this->getSpecInfos($div,'exterieur'); break;
case 2: $this->getSpecInfos($div,'evaluation'); break;
case 3: $this->getSpecInfos($div,'equipement'); break;
case 4: $this->getSpecInfos($div,'services'); break;
}
$sect++;
}else if($div->getAttribute('class') == 'section'){
# obtenir les détails des pièces
foreach($div->getElementsByTagName('table') as $table){
if($table->getAttribute('class') == 'details-pieces'){
$this->detailPieces($table);
}
}
}
}
}
private function getDesc($class,$obj){
if($class == 'section description'){
$p = $obj->getElementsByTagName('p')->item(0);
$text = (string)$p->nodeValue;
$this->prop[$this->lang]['description'] = $text;
}
}
private function getCaract($class,$obj){
if($class == 'section characteristiques'){
foreach($obj->getElementsByTagName('div') as $div){
if(substr($div->getAttribute('class'),0,4) == "item"){
$text = (string)$div->nodeValue;
$this->prop[$this->lang]['caracteritiques'][substr($div->getAttribute('class'),5)] = $text;
}
}
}
}
private function getSpecInfos($obj,$nomInfo){
foreach($obj->getElementsByTagName('table') as $table){
foreach($table->getElementsByTagName('tr') as $tr){
$name = $tr->getElementsByTagName('td')->item(0);
$value = $tr->getElementsByTagName('td')->item(1);
$name = substr((string)$name->nodeValue,0,-2);
$value = (string)$value->nodeValue;
$this->prop[$this->lang][$nomInfo][$this->noAccents($name)] = $value;
}
}
}
private function detailPieces($obj){
$tbody = $obj->getElementsByTagName('tbody')->item(0);
foreach($tbody->getElementsByTagName('tr') as $tr){
$name = $tr->getElementsByTagName('td')->item(0);
$name = (string)$name->nodeValue;
$level = $tr->getElementsByTagName('td')->item(1);
$level = (string)$level->nodeValue;
$dimensions = $tr->getElementsByTagName('td')->item(2);
$dimensions = (string)$dimensions->nodeValue;
$floor = $tr->getElementsByTagName('td')->item(3);
$floor = (string)$floor->nodeValue;
$desc = $tr->getElementsByTagName('td')->item(4);
$desc = (string)$desc->nodeValue;
$this->prop[$this->lang]['pieces'][$this->noAccents($name)]['etage'] = $level;
$this->prop[$this->lang]['pieces'][$this->noAccents($name)]['dimensions'] = $dimensions;
$this->prop[$this->lang]['pieces'][$this->noAccents($name)]['revetement'] = $floor;
$this->prop[$this->lang]['pieces'][$this->noAccents($name)]['description'] = $desc;
}
}
private function innerHTML($element){
$innerHTML = "";
$children = $element->childNodes;
foreach ($children as $child)
{
$tmp_dom = new DOMDocument();
$tmp_dom->appendChild($tmp_dom->importNode($child, true));
$innerHTML.=trim($tmp_dom->saveHTML());
}
return $innerHTML;
}
private function noAccents($value){
$string= strtr($chaine,"ÀÁÂÃÄÅàáâãäåÒÓÔÕÖØòóôõöøÈÉÊËèéêëÇçÌÍÎÏìíîïÙÚÛÜùúûüÿÑñ","aaaaaaaaaaaaooooooooooooeeeeeeeecciiiiiiiiuuuuuuuuynn");
}
private function changeEntry(){
$this->entry++;
echo $this->entry;
$this->scan();
}
private function curlGet($url){
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_ENCODING, "gzip");
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($curl);
curl_close($curl);
return $data;
}
}
$entry = 8678057;
$crawler = new crawler($entry);
echo '<pre>';
print_r($crawler->prop);
echo '</pre>';