1

Chrome ウェブストアで入手できる人気のある拡張機能によって獲得されたレビューに関するデータの収集/スクレイピングに興味があります。

特に、特定の拡張機能に残されたレビューの総数を取得してから、このアドオンで公開されているすべてのレビューを取得する必要があります。私の問題は次のとおりです。関心のあるデータはjsonリクエストを介して利用できるため、標準のPHP Curlスクレーパーを作成できません。特に、呼び出す必要があります。

私はこれを書いてみました:

 <script src="http://code.jquery.com/jquery-latest.js"></script>
 <script type="text/javascript">
  /**
   * Requests the public text reviews of a Chrome Web Store extension and
   * hands the parsed JSON response to `callback`.
   *
   * The request body is sent as `req=<json>&requestSource=widget`, exactly as
   * before; only the completion hook and the paging values changed.
   *
   * @param {string} extensionId - Web Store id of the extension.
   * @param {function(Object)} callback - Invoked with the parsed response on success.
   * @param {Object} [options] - Optional paging overrides.
   * @param {number} [options.startIndex=10] - Index of the first review to return
   *     (the historical default of 10 skips the first ten reviews; pass 0 to start at the top).
   * @param {number} [options.numResults=10] - Number of reviews to return.
   * @returns {Object} The jqXHR returned by `$.ajax`, so callers can attach `.fail()`.
   */
  function getReviews(extensionId, callback, options) {
    var settings = options || {};
    var startIndex = typeof settings.startIndex === 'number' ? settings.startIndex : 10;
    var numResults = typeof settings.numResults === 'number' ? settings.numResults : 10;

    var entities = [{'url' : 'http://chrome.google.com/extensions/permalink?id=' + extensionId}];
    var param = {
      "searchSpecs": [{
        "requireComment": true,
        "entities": entities,
        "groups": ["public_comment"],
        "matchExtraGroups": true,
        "sortBy": "quality",
        "startIndex": startIndex,
        "numResults": numResults,
        "includeNickNames": true
      }],
      "applicationId": 94
    };

    // `.done()` replaces `.success()`: the latter is a deprecated jqXHR alias
    // that no longer exists in jQuery 3.
    return $.ajax({
      type: 'POST',
      url: 'https://chrome.google.com/reviews/json/search',
      contentType: 'application/xml',
      xhrFields: {withCredentials: true },
      dataType: 'json',
      data: 'req=' + JSON.stringify(param) + '&requestSource=widget'
    }).done(callback);
  }
</script>

<script type="text/javascript">
 $(document).ready(getReviews('gighmmpiobklfepjocnamgkkbiglidom', function(reviews) {    console.log(reviews); }));
</script>

私は jQuery/JSON(-P) にあまり詳しくなく、上記のコードは間違いなくどこかが誤っています。

私の質問は次のとおりです。

  1. 同一ドメイン ポリシーをバイパスする方法は? YQLを試してみましたが成功しませんでした...
  2. ID (例: gighmmpiobklfepjocnamgkkbiglidom) で識別される特定の拡張機能について、chrome.google.com/reviews/components からレビュー数 ('numRatings') を、chrome.google.com/reviews/json/ からレビュー ('comments') のみを取得するには、URL/'data' をどのようにフォーマットすればよいですか?

PHP を使用して人気のある Mozilla Addons のこの種のスクレイピングを既に実行し、標準の curl/XPath を使用して必要なデータを収集しました。

ご協力いただきありがとうございます!

4

回答 2 件

0

1) 最も簡単な方法は、Chrome 拡張機能を作成することです。

2) https://github.com/xpressyoo/MyExtensionsを参照

[...]
getComments : function() {
        var entities = [];
        //each(Ext.extensions, function(data, id) {
            entities.push({'url' : 'http://chrome.google.com/extensions/permalink?id=' + this.hash});
        //});               

        Ext.XHR['comments'] = new Ajax({
            'method'        : 'POST',
            'encodeURI'     : false,    // Needed
            'url'           : 'https://chrome.google.com/reviews/json/search',
            'headers'       : {
                'Content-type'  : 'application/xml'
            },

            'parameters'    : {
                'req'       : JSON.stringify({'searchSpecs' :  [{'entities' : entities, 'groups' : ['public_comment'], 'matchExtraGroups' : true,"sortBy":"quality", 'startIndex' : 0, 'numResults' : 80, 'includeNickNames' : true}], 'applicationId' : 94 }) + '&requestSource=widget'
            },

            'onSuccess'     : function(xhr) {
                var json = xhr.responseJSON;
                if(json && json.searchResults ) {
                    this.comments = {

'total'             : Number(json.searchResults[0].numAnnotations.toString().replace(/,/, '').toInt()),
'latest'            : json.searchResults[0].annotations ? json.searchResults[0].annotations[0] :{},
'previous'          : this.comments.total || null,
'latestPrevious'            : $merge(this.comments.latest) || null,
'new'                   : this.comments['new'] || false
                    }
Ext.XHR['comments'] = null;
                }
            }.bind(this)    
        }).send();      

        return this;
    },
 [...]

// Total number of reviews reported for the extension.
var nbreviews = this.comments.total;

// Text of the latest comment with line breaks removed ('' when there is none).
var latestcomment = '';
if (this.comments.latest0 && this.comments.latest0.comment) {
    latestcomment = this.comments.latest0.comment.replace(/\n/gi, '');
}

// Text of the nth comment with line breaks removed ('' when there is none).
var nthcomment = '';
if (this.comments.latestn && this.comments.latestn.comment) {
    nthcomment = this.comments.latestn.comment.replace(/\n/gi, '');
}

ここで:

'latestn'           : json.searchResults[0].annotations ? json.searchResults[0].annotations[n] :{},
2011-09-10T02:18:33.840 に回答
0

これは、並列 cURL を使用して PHP で行う方法です。このスクリプトは、Chrome Web ストアにあるすべての拡張機能 (人気順) をスクレイピングし、次のような情報を取得します。

  • 利用者数
  • 星評価の数
  • テキストレビュー数
  • 各テキスト レビューの文字数 (拡張機能ごとにスクレイピングされた最大 100 レビュー)
// Scrapes the Chrome Web Store "most popular" extension listing and, for every
// extension found, queries the reviews JSON endpoints, then prints one HTML
// table row of user / rating / review statistics per extension.

// Base URL of the Chrome Web Store.
$url0 = "https://chrome.google.com/";

// Listing pages are fetched in batches of 5 (pages 1-5, 6-10, ..., 706-710).
foreach (range(0, 705, 5) as $x) {

    // Page window covered by this batch.
    $frompge = $x + 1;
    $topge   = $x + 5;
    $nbpages = ($topge - $frompge) + 1;
    $zitems  = $nbpages * 20;

    // ---- Download the listing pages of this batch in parallel ----
    $mh      = curl_multi_init();
    $running = null;

    $urls = array();
    for ($a = $frompge; $a <= $topge; $a++) {
        $urls[] = $url0 . 'webstore/list/most_popular/' . $a . '?category=ext';
    }

    $c = array();
    foreach ($urls as $name => $url) {
        $c[$name] = curl_init($url);
        curl_setopt($c[$name], CURLOPT_HEADER, false);
        curl_setopt($c[$name], CURLOPT_FAILONERROR, true);
        curl_setopt($c[$name], CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($c[$name], CURLOPT_AUTOREFERER, true);
        curl_setopt($c[$name], CURLOPT_RETURNTRANSFER, true);
        curl_setopt($c[$name], CURLOPT_TIMEOUT, 10);
        curl_multi_add_handle($mh, $c[$name]);
    }

    // Run all transfers to completion. curl_multi_select() blocks until there
    // is socket activity so the loop does not spin at 100% CPU.
    do {
        curl_multi_exec($mh, $running);
        if ($running > 0) {
            curl_multi_select($mh, 1.0);
        }
    } while ($running > 0);

    $html = array();
    foreach ($urls as $name => $url) {
        $html[] = curl_multi_getcontent($c[$name]);
        curl_multi_remove_handle($mh, $c[$name]);
        curl_close($c[$name]);
    }
    curl_multi_close($mh);

    // ---- Parse each downloaded listing page ----
    for ($b = 0; $b <= $nbpages - 1; $b++) {

        // A failed download leaves an empty body; loading it would raise an error.
        if (empty($html[$b])) {
            continue;
        }

        $dom = new DOMDocument();
        @$dom->loadHTML($html[$b]);

        $xpath = new DOMXPath($dom);
        $links = $xpath->query("//a[contains(@class, 'title-a')]");

        $result = array();

        foreach ($links as $item) {
            $newDom = new DOMDocument;
            $newDom->appendChild($newDom->importNode($item, true));

            $itemXpath = new DOMXPath($newDom);
            $cleaner   = array(" users", " user", "(", ")", ",", " ");

            // The extension metadata is read from the first <script> inside the link.
            $script = $itemXpath->query("//script")->item(0);
            if ($script === null) {
                continue;
            }
            $data = str_replace($cleaner, "", trim($script->nodeValue));

            // Quote-delimited fields: index 1 = id, 5 = name, 15 = user count.
            // array_pad() keeps the lookups defined when the script is shorter than expected.
            $parts   = array_pad(explode("\"", $data), 16, '');
            $id      = $parts[1];
            $name    = $parts[5];
            $nbusers = $parts[15];
            if ($id === '') {
                continue;
            }

            // preg_replace() replaces ereg_replace(), which was removed in PHP 7.
            $label = str_replace(" ", "", strtolower(preg_replace('/[^A-Za-z0-9 ]/', '', $name)));

            //CATEGORIES (based on nb of users)
            if ($nbusers < 100) { $category = '1'; $color = 'inherit'; }
            else if ($nbusers >= 100 && $nbusers < 1000) { $category = '2'; $color = '#E6EEEE'; }
            else if ($nbusers >= 1000 && $nbusers < 10000) { $category = '3'; $color = '#CDDEDE'; }
            else if ($nbusers >= 10000 && $nbusers < 100000) { $category = '4'; $color = '#B5CDCD'; }
            else if ($nbusers >= 100000 && $nbusers < 1000000) { $category = '5'; $color = '#9CBDBD'; }
            else if ($nbusers == '1000000+') { $category = '6'; $color = '#83ACAC'; }
            else { $category = '-9'; $color = 'inherit'; }

            // ---- Query the reviews endpoints for this extension ----
            $extURL = 'http://chrome.google.com/extensions/permalink?id=' . $id;
            $c1     = curl_init('https://chrome.google.com/reviews/json/search');
            $c1a    = curl_init('https://chrome.google.com/reviews/json/search');
            $c2     = curl_init('https://chrome.google.com/reviews/json/lookup');

            // Options shared by the three POST requests.
            // NOTE(review): CURLOPT_SSL_VERIFYPEER => false disables TLS certificate
            // verification; kept as in the original, consider enabling it.
            $baseOptions = array(
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_POST => true,
                CURLOPT_TIMEOUT => 10,
            );

            // Up to 100 comments, sorted by "quality".
            $fields1 = http_build_query(array(
                'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"sortBy":"quality","startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
            ));
            // Same search without an explicit sort order.
            $fields1a = http_build_query(array(
                'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
            ));
            // Aggregate rating information.
            $fields2 = http_build_query(array(
                'req' => '{"entities":[{"url" : "'.$extURL.'", "includeAggregateInfo" : true}],"applicationId":94}',
            ));

            curl_setopt_array($c1, $baseOptions + array(CURLOPT_POSTFIELDS => $fields1));
            curl_setopt_array($c1a, $baseOptions + array(CURLOPT_POSTFIELDS => $fields1a));
            curl_setopt_array($c2, $baseOptions + array(CURLOPT_POSTFIELDS => $fields2));

            $mh2 = curl_multi_init();
            curl_multi_add_handle($mh2, $c1);
            curl_multi_add_handle($mh2, $c1a);
            curl_multi_add_handle($mh2, $c2);

            $active = null;
            do {
                curl_multi_exec($mh2, $active);
                if ($active > 0) {
                    curl_multi_select($mh2, 1.0);
                }
            } while ($active > 0);

            // Collect the responses, then release every handle (the easy handles
            // were previously never closed and leaked once per extension).
            $json1  = curl_multi_getcontent($c1);
            $json1a = curl_multi_getcontent($c1a);
            $json2  = curl_multi_getcontent($c2);
            curl_multi_remove_handle($mh2, $c1);
            curl_multi_remove_handle($mh2, $c1a);
            curl_multi_remove_handle($mh2, $c2);
            curl_close($c1);
            curl_close($c1a);
            curl_close($c2);
            curl_multi_close($mh2);

            // NOTE(review): utf8_encode() assumes the responses are ISO-8859-1; if the
            // service already returns UTF-8 this double-encodes non-ASCII text - confirm.
            $data1  = json_decode(utf8_encode((string) $json1), true);
            $data1a = json_decode(utf8_encode((string) $json1a), true);
            $data2  = json_decode(utf8_encode((string) $json2), true);

            // Skip only this extension when a request failed or reported an error.
            // (A top-level `return` here used to end the whole script.)
            if (!is_array($data1) || !is_array($data1a) || !is_array($data2)
                || !empty($data1['channelHeader']['errorCode'])
                || !empty($data1a['channelHeader']['errorCode'])
                || !empty($data2['channelHeader']['errorCode'])) {
                continue;
            }

            // Review counts. The value may arrive formatted with thousands
            // separators, so commas are stripped before the integer cast.
            $nbreviews = isset($data1['searchResults'][0]['numAnnotations'])
                ? (int) str_replace(',', '', (string) $data1['searchResults'][0]['numAnnotations'])
                : 0;
            $nbreviewsa = isset($data1a['searchResults'][0]['numAnnotations'])
                ? (int) str_replace(',', '', (string) $data1a['searchResults'][0]['numAnnotations'])
                : 0;
            // At most 100 comments are requested, so averages use min(total, 100).
            $nbreviews2 = ($nbreviews > 100) ? 100 : $nbreviews;

            // Sum the comment lengths (whitespace and "&gt;" removed) of both result sets.
            // strlen() counts bytes, as in the original.
            $lengthSums = array();
            foreach (array($data1, $data1a) as $setIndex => $payload) {
                $annotations = (isset($payload['searchResults'][0]['annotations'])
                        && is_array($payload['searchResults'][0]['annotations']))
                    ? $payload['searchResults'][0]['annotations']
                    : array();
                $total = 0;
                foreach ($annotations as $annotation) {
                    $text = isset($annotation['comment']) ? (string) $annotation['comment'] : '';
                    $msg  = preg_replace('/[\n\r\t]/', ' ', htmlspecialchars($text));
                    $msg  = str_replace("&gt;", "", $msg);
                    $msg  = str_replace(" ", "", $msg);
                    $total += strlen($msg);
                }
                $lengthSums[$setIndex] = $total;
            }
            $add  = $lengthSums[0];
            $adda = $lengthSums[1];

            // Average lengths; guarded because an extension may have no text reviews.
            $final  = ($nbreviews2 > 0) ? $add / $nbreviews2 : 0;
            $finala = ($nbreviews2 > 0) ? $adda / $nbreviews2 : 0;

            //Ratings
            $nbratings = isset($data2['annotations'][0]['aggregateInfo']['numRatings'])
                ? (int) str_replace(',', '', (string) $data2['annotations'][0]['aggregateInfo']['numRatings'])
                : 0;
            $nbstars = isset($data2['annotations'][0]['aggregateInfo']['averageRating'])
                ? $data2['annotations'][0]['aggregateInfo']['averageRating']
                : 0;

            // "1000000+" casts to 1000000; guard the ratios against a zero user count.
            $usersNum = (int) $nbusers;
            $delta    = $nbratings - $nbreviews;
            $ratio    = ($usersNum > 0) ? $nbratings / $usersNum : 0;
            $ratio2   = ($usersNum > 0) ? $nbreviews / $usersNum : 0;

            //PUT VALUES TOGETHER
            $result[] = array($name,$label,$id,$category,$nbusers,$nbratings,$nbreviews,$nbreviewsa,$delta,$ratio,$ratio2,$nbstars,$nbreviews2,$add,$final,$adda,$finala);
        }//END FOREACH

        //DISPLAY RESULTS
        // Iterate over the rows actually collected instead of a fixed 0..20 range,
        // which ran one row too far and indexed rows that do not exist.
        foreach ($result as $row) {
            $cells = '';
            foreach ($row as $col => $value) {
                // Column 2 (the extension id) is not displayed.
                if ($col === 2) {
                    continue;
                }
                // Scraped values are escaped before being written into the HTML table.
                $cells .= ($col < 2 ? "<td class=\"non\">" : "<td>")
                    . htmlspecialchars((string) $value) . "</td>";
            }
            echo "<tr>" . $cells . "</tr>";
            if (ob_get_level() > 0) {
                ob_flush();
            }
            flush();
        }

    }
}//END FOREACH
2012-04-16T12:00:44.153 に回答