0

フィードをスクレイピングするための一般的な PhantomJS セットアップがあります (所有者の許可を得て、クライアントのためにこれを行っています)。URL、ページをめくるための jQuery/javascript コード、およびフィードからリンクを選択するためのセレクターを指定します。

このフィードは、ページをめくるボタンを除いてすべて正常に読み込まれているようです [画像を参照]。

PhantomJS によるレンダリング:

「ページをめくる」ボタンがない PhantomJS 写真

コンピューターの Chrome でレンダリング:

ここに画像の説明を入力

私は1日以上困惑しています。

どんな助けでも大歓迎です。

私のコード:

var page = new WebPage({
                  settings: {
                    loadPlugins: true,
                    userAgent : "Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5",
                    XSSAuditingEnabled: false,
                    webSecurityEnabled: false
                  },
                  viewportSize: { width: 1366, height: 768 }
               }),
    output_phantom = {errors: [], results: null};

var turn_page_jquery    = "$('#yui-pg0-0-next-link').click_link()",
    url_selector_jquery = "$('h3 a').multiAttr('href')",
    url                 = "http://www.springcjd.com/new/search?dpt=2#QryString=%3FlogSearch%3Dfalse%26sortCol%3Dnull%26sortType%3Dnull%26VIN%3Dnull%26dealerStockID%3Dnull%26stockType%3D2%26year%3Dnull%26make%3Dnull%26model%3Dnull%26subModel%3Dnull%26body%3Dnull%26minMileage%3Dnull%26maxMileage%3Dnull%26numOfDoors%3Dnull%26certified%3Dnull%26minDFList%3Dnull%26maxDFList%3Dnull%26onLotAfter%3Dnull%26onLotBefore%3Dnull%26pageNum%3D1%26carsPerPage%3D30%26fullText%3Dnull%26%26mpghmn%3Dnull%26%26lotID%3Dnull%26daysOnLotMin%3Dnull%26%26output%3Djson%22";

// Start things going here
get_links_from_pages(url, url_selector_jquery, turn_page_jquery);

// Allows you to pass args to function in page
function evaluate(page, func) {
    var args = [].slice.call(arguments, 2);
    var fn   = "function() { return (" + func.toString() + ").apply(this, " + JSON.stringify(args) + ");}";
    return page.evaluate(fn);
}

page.onError = function (msg, trace) {
    console.log(msg);
    trace.forEach(function(item) {
        console.log('  ', item.file, ':', item.line);
    });
};

// Communicator between phantomJS and the page.
page.onConsoleMessage = function(msg) 
{
    var msg_json = JSON.parse(msg);

    if(msg_json && msg_json.type)
    {
        var type    = msg_json.type;
        var message = msg_json.message;

        switch(type)
        {
            case 'return_value':
                output_phantom.results = message;
                console.log(JSON.stringify(output_phantom)); 
                phantom.exit(); 
                break;
            case 'message':
                output_phantom.errors.push(message);
                break;
            case 'exit': 
                phantom.exit();
                break;
            case 'render': 
                var photo_name = 'phantom_test.png';
                if(message != '') 
                    photo_name = message;
                page.render(photo_name);
                break;
        }
    }
    else
       output_phantom.errors.push(msg);
};

function inject_scripts()
{
    // Inject jquery and our additional script if they don't exist
    // Would like to be able to overwrite an older version of jQuery
    if(page.evaluate(function () { return typeof jQuery;}) == 'undefined')
    {
        if(page.evaluate(function () { return typeof $;}) == 'function') {
            console.log("'$' symbol already used.");
        }
        else
        {
            if (!page.injectJs("/../../jquery.js")) {
                console.log("jQuery not loaded...");
                phantom.exit();                
            }
        }
    } 

    // Inject scripts that allow communication using fn onConsoleMessage
    if (!page.injectJs("/../lf_additional.js")) {
        console.log("Additional scripts not loaded...");
        phantom.exit();                
    }
}

function run_phantom(url, fn/* args here, just not seen */)
{
    var extra_args = [].slice.call(arguments, 2);

    page.open(url, function (status) {
        // if (status !== 'success') {
        //     console.log('Unable to load the url! (URL: '+url+')');
        //     phantom.exit();
        // }
        // else 
        // {
            inject_scripts();
            output_phantom = {errors: [], results: null}; 

            // run our js code inside the headless browser.
            extra_args.unshift(page, fn);
            evaluate.apply(this, extra_args);
        // }
    });
}

function get_links_from_pages(url, url_selector_jquery, turn_page_jquery)
{
    var fn = function(url_selector_jquery, turn_page_jquery) 
             {
                var results = [];
                var i = 0; 

                var interval = setInterval(function() 
                    {
                        // Photograph page right before selecting values
                        phantom_render('ph_scjd_'+i+'.png');

                        var selected_data = eval(url_selector_jquery);

                        //
                        results.push(selected_data);

                        // Try to turn the page
                        eval(turn_page_jquery);

                        // Get the first 4 pages
                        if(i >= 3) {
                            phantom_return(results);
                            clearInterval(interval);
                        }
                        i++;

                    }, 3000);
             };
    return run_phantom(url, fn, url_selector_jquery, turn_page_jquery);
}

//////////////////////////////////////////////////////////
// Other stuff

// Improved json parsing
(function() {

    var parse = JSON.parse;

    JSON = {

        stringify: JSON.stringify,

        validate: function(str) {

            try {
                parse(str);
                return true;
            } catch(err){
                return err;
            }
        },

        parse: function(str) {

            try {
                return parse(str);
            } catch(err){
                return undefined;
            }
        }
    }
})();

挿入されたファイル「additional.js」:

/* 
 *  These are additional scripts intended to make selection of multiple elements
 *  much more concise.
 */

$.fn.click_link = function() {
     simulateMouseClick(this.selector);
};

$.fn.collect = function(fn) {
    var values = [];

    if (typeof fn == 'string') {
        var prop = fn;
        fn = function() { return this.attr(prop); };
    }

    $(this).each(function() {
        var val = fn.call($(this));
        values.push(val);
    });
    return values;
};

$.fn.multiAttr = function(attrName) {
    return this.collect(attrName);
};

// .text() should be pretty close, except concatenated?
$.fn.multiHtml = function() {
    var val_array = this.collect(function() { return this.html(); });
    return val_array;
};

$.fn.multiVal = function() {
    return this.multiAttr('value');
};

// The commented out code is much more concise, but probably less efficient
// $(arr1).not(arr2).length == 0 && $(arr2).not(arr1).length == 0
jQuery.extend({
    compareArray: function (arrayA, arrayB) {
        if (arrayA.length != arrayB.length) { return false; }
        // sort modifies original array
        // (which are passed by reference to our method!)
        // so clone the arrays before sorting
        var a = jQuery.extend(true, [], arrayA);
        var b = jQuery.extend(true, [], arrayB);
        a.sort(); 
        b.sort();
        for (var i = 0, l = a.length; i < l; i++) {
            if (a[i] !== b[i]) { 
                return false;
            }
        }
        return true;
    }
});

function simulateMouseClick(selector) { 
    var targets = document.querySelectorAll(selector), 
        evt = document.createEvent('MouseEvents'), 
        i, len; 
    evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null); 

    for ( i = 0, len = targets.length; i < len; ++i ) { 
        targets[i].dispatchEvent(evt);     
    }
}

function send_console_command(type, message)
{
    var msg = {};

    if(!message) message = '';

    msg.message    = message;
    msg.type       = type;
    msg.validation = 'phantom_js_communicator';

    console.log(JSON.stringify(msg));
}

function phantom_exit() {
    send_console_command('exit');
}

function phantom_message(msg) {
    send_console_command('message', msg);
}

function phantom_return(return_val) {
    send_console_command('return_value', return_val);
}

function phantom_render(photo_name) {
    send_console_command('render', photo_name);
}
4

1 に答える 1

0

そうじゃないですか

var page = require('webpage').create();
page.viewportSize = { width: 1366, height: 768 };

ドキュメントを読んだだけで、テストされていません。

于 2012-06-13T14:54:47.450 に答える