0

SEO 向けに最適化する必要がある GWT アプリケーションがあります (Google にコンテンツをクロールさせたい)。これまで多くの方法を試しましたが、いずれも要件を満たしませんでした (HTML ページを返すのに時間がかかりすぎます)。試した方法は次のとおりです。

  1. HtmlUnit をヘッドレス ブラウザとして使用し、オンデマンドでページをクロールする方法。ただし、HTML コンテンツの取得に約 15 秒かかります (処理時間を計測したところ、その 80% はバックグラウンド JavaScript を待機するループ `while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)` に費やされていました)。
  2. Google からのリクエストより前にページをクロールしておき、Google がリクエストしたときに保存済みのスナップショットを返す方法 (ただし、コンテンツが頻繁に変更されるうえ、Google に「クローキング (cloaking)」と見なされる可能性があるため、この方法はまったく実用的ではありません)。

何か提案はありますか?

クロールに使用されるコード:

/**
 * Servlet filter that serves crawler requests carrying the
 * {@code _escaped_fragment_} query parameter with a static HTML snapshot.
 *
 * <p>When the parameter is present, the request is rewritten to the matching
 * {@code #!} URL, loaded through HtmlUnit against {@code 127.0.0.1} on the
 * request's own port, and the resulting DOM is written back as
 * {@code text/html}. All other requests are passed down the filter chain
 * untouched.
 *
 * <p>The filter keeps no per-request state in fields: every snapshot uses its
 * own {@link WebClient}, which is closed before the request returns.
 */
public class CrawlFilter implements Filter {

    /**
     * Ajax controller that reports every request as synchronous, so that
     * {@code XMLHttpRequest} calls are completed before the page is read.
     */
    private static final class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    private static final Logger log = Logger.getLogger(CrawlFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This is the form that appears when the token is not preceded by
     * other query parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This is the form that appears when the token follows already
     * existing query parameters (hence the leading {@code &}).
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    /** Timeout passed to the JavaScript engine's {@code pumpEventLoop} call. */
    private static final long _pumpEventLoopTimeoutMillis = 30000;
    /** Timeout passed to each {@code waitForBackgroundJavaScript} call. */
    private static final long _jsTimeoutMillis = 1000;
    /** Pause between two background-JavaScript checks. */
    private static final long _pageWaitMillis = 200;
    /** Maximum number of additional background-JavaScript checks. */
    private static final int _maxLoopChecks = 2;

    /**
     * Renders a snapshot when the query string contains the escaped-fragment
     * token; otherwise delegates to the rest of the chain.
     *
     * @param request     the incoming request; cast to {@link HttpServletRequest}
     * @param response    the outgoing response; cast to {@link HttpServletResponse}
     * @param filterChain the remaining filter chain
     * @throws IOException      if loading the page or writing the response fails
     * @throws ServletException if a downstream filter or servlet fails
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain filterChain) throws IOException, ServletException {
        // Grab the request uri and query strings.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();
        final HttpServletResponse httpResponse = (HttpServletResponse) response;

        if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
            final int port = httpRequest.getServerPort();
            final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
            final String scheme = httpRequest.getScheme();
            final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
            final WebRequest webRequest = new WebRequest(urlWithHashFragment);

            log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

            httpResponse.setContentType("text/html;charset=UTF-8");
            final PrintWriter out = httpResponse.getWriter();
            out.println(renderPage(webRequest));
            out.flush();
            out.close();

            log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
        } else {
            filterChain.doFilter(request, response);
        }
    }

    /**
     * Nothing to release here: each snapshot closes its own {@link WebClient}
     * in {@link #renderPage(WebRequest)}.
     */
    @Override
    public void destroy() {
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    /**
     * Loads the given request in a fresh HtmlUnit client, waits for
     * JavaScript to settle and returns the page's XML preceded by a short
     * banner for human visitors.
     *
     * <p>A new client is created per call and always closed in
     * {@code finally}; a filter instance handles requests concurrently, so a
     * client held in a field would be shared between threads and leaked each
     * time it was replaced.
     *
     * @param webRequest the rewritten {@code #!} request to load
     * @return the banner followed by the page snapshot
     * @throws IOException if HtmlUnit fails to load the page
     */
    private StringBuilder renderPage(WebRequest webRequest) throws IOException {
        final WebClient client = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            client.getCache().clear();
            client.getOptions().setCssEnabled(false);
            client.getOptions().setJavaScriptEnabled(true);
            client.getOptions().setThrowExceptionOnScriptError(false);
            client.getOptions().setRedirectEnabled(false);
            client.setAjaxController(new SyncAllAjaxController());
            client.setCssErrorHandler(new SilentCssErrorHandler());

            final HtmlPage page = client.getPage(webRequest);
            client.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

            awaitBackgroundJavaScript(client, page);

            if (client.getJavaScriptEngine().isScriptRunning()) {
                log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                client.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            // The DOM must be serialized before the client is closed below.
            final String staticSnapshotHtml = page.asXml();

            // NOTE(review): this link uses the 127.0.0.1 URL built in doFilter,
            // not the public host - confirm that this is intended.
            final StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("<hr />\n");
            stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            // The URL is derived from the request's query string, so escape it
            // before placing it inside an HTML attribute.
            stringBuilder.append(escapeHtml(String.valueOf(webRequest.getUrl())));
            stringBuilder.append("\">this link</a> for the interactive application.<br></h3></center>");
            stringBuilder.append("<hr />");
            stringBuilder.append(staticSnapshotHtml);

            return stringBuilder;
        } finally {
            client.closeAllWindows();
        }
    }

    /**
     * Waits for background JavaScript, re-checking up to
     * {@code _maxLoopChecks} times with a short pause between checks.
     *
     * <p>If the thread is interrupted while pausing, the interrupt flag is
     * restored and waiting stops.
     *
     * @param client the client whose background JavaScript is awaited
     * @param page   the loaded page, used as the monitor for the pause
     */
    private static void awaitBackgroundJavaScript(WebClient client, HtmlPage page) {
        int pendingJobs = client.waitForBackgroundJavaScript(_jsTimeoutMillis);
        int loopCount = 0;

        while (pendingJobs > 0 && loopCount < _maxLoopChecks) {
            ++loopCount;
            pendingJobs = client.waitForBackgroundJavaScript(_jsTimeoutMillis);

            if (pendingJobs == 0) {
                log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                break;
            }

            synchronized (page) {
                log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                try {
                    page.wait(_pageWaitMillis);
                } catch (InterruptedException e) {
                    log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                    // Preserve the interrupt for the caller and stop waiting.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
    }

    /**
     * Escapes the characters that are significant in HTML text and attribute
     * values.
     *
     * @param value the raw text
     * @return the text with {@code & < > " '} replaced by character references
     */
    private static String escapeHtml(String value) {
        final StringBuilder escaped = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            final char c = value.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes any
     * characters that were escaped by the crawler. If the query string does not
     * contain _escaped_fragment_, it is not modified.
     *
     * @param queryString the raw query string of the crawler request
     * @return A modified query string followed by a hash fragment if applicable.
     *         The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 decoding is unavailable
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Look for the "&"-prefixed form first so the separator is dropped
        // together with the token.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;

        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }

        if (index != -1) {
            StringBuilder queryStringSb = new StringBuilder();
            if (index > 0) {
                // Keep whatever parameters preceded the token.
                queryStringSb.append("?");
                queryStringSb.append(queryString.substring(0, index));
            }
            queryStringSb.append("#!");
            queryStringSb.append(URLDecoder.decode(queryString.substring(index
                    + length, queryString.length()), "UTF-8"));
            return queryStringSb.toString();
        }

        return queryString;
    }
}
4

1 件の回答

0

HtmlUnit を使って静的な HTML をオフラインで生成することをお勧めします。そうすれば、更新頻度を自分で制御できます。

そのうえで、クローラーからのリクエストをインターセプトするサーブレット フィルターで、生成済みの静的 HTML を返します。

2013-11-04T03:43:19.610 に回答