1

Mechanize を使用して Bing 検索エンジンをスクレイピングしています。しかし、bing.com で同じ検索クエリを実行すると 1400 件の結果が返されるのに、プログラムでは最大 200 件の結果しか得られません。どこに落とし穴があるのでしょうか?

 # Builds a query from +options+, runs it through bing_search, and hands the
 # resulting page to extract_contacts_from_bing_page and bing_links.
 #
 # @param options [Hash] query options forwarded unchanged to build_query
 # @return [Array] two-element array: [contacts, links]
 # @raise [TypeError] with message "Invalid Arguments" if +options+ is not a Hash
 def generate_profiles_from_group(options={})
  # Raise the exception class itself; passing the String "TypeError" as the
  # first argument makes Ruby complain about the raise call and drops the
  # intended message.
  raise TypeError, "Invalid Arguments" unless options.is_a? Hash

  page = bing_search(build_query(options))
  contacts = extract_contacts_from_bing_page page: page
  links = bing_links page
  [contacts, links]
end

# Walks result pages starting at options[:page] and collects name candidates
# from the text of every `h3 a` element.
#
# For each heading the first two whitespace-separated words are joined into a
# name; headings with fewer than two words are skipped, as are names that
# contain anything other than ASCII letters, apostrophes, commas or
# whitespace.
#
# Pagination follows the link whose text equals the text of the element(s)
# carrying the "sb_pagN" class, and stops when that text is empty or when no
# link with that text exists on the page.
#
# @param options [Hash] :page is the first result page (must respond to
#   #parser and #link_with); :company and :title are optional values copied
#   into every returned entry
# @return [Array<Hash>] entries of the form {name:, company:, title:}
def extract_contacts_from_bing_page(options)
  page = options[:page]
  company = options[:company]
  title = options[:title]
  stack = []
  loop do
    page.parser.search('h3 a').each do |anchor|
      first_word, second_word = anchor.text.split(' ')
      next unless second_word # fewer than two words (covers empty text too)

      name = first_word + ' ' + second_word
      stack << {name: name, company: company, title: title} unless name =~ /[^a-zA-Z',\s]/i
    end

    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    # link_with returns nil when no link carries that exact text; clicking nil
    # would raise and throw away everything collected so far.
    next_link = page.link_with(text: keyw)
    break unless next_link

    page = @agent.click next_link
  end
  stack
end

# Collects the non-empty text of every <cite> element across result pages,
# starting at +page+.
#
# Pagination follows the link whose text equals the text of the element(s)
# carrying the "sb_pagN" class, and stops when that text is empty or when no
# link with that text exists on the page. Before each follow-up request the
# method sleeps for a random 10-49 seconds (presumably to throttle requests;
# the reason is not stated in this file).
#
# @param page first result page (must respond to #parser and #link_with)
# @return [Array<String>] cite texts in the order encountered
def bing_links(page)
  stack = []
  loop do
    page.parser.xpath('//cite').each do |cite|
      cite_text = cite.text
      stack << cite_text unless cite_text == ""
    end

    keyw = page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "sb_pagN", " " ))]').text
    break if keyw == ""

    # Resolve the link before sleeping: link_with returns nil when nothing
    # matches, and clicking nil would raise after a pointless wait.
    next_link = page.link_with(text: keyw)
    break unless next_link

    sleep(10+rand(40))
    page = @agent.click next_link
  end
  stack
end
# Builds the search query string for the given options.
#
# The first applicable combination wins, checked in this order:
# name + company, name + title, title + company, group.
#
# @param options [Hash] may contain :name, :title, :company and :group
# @return [String, nil] the query, or nil when no combination applies
def build_query(options)
  name, title, company, group = [:name, :title, :company, :group].map do |key|
    options[key] if options.has_key?(key)
  end

  return "site:linkedin.com \"#{name}\" \"at #{company}\"" if name && company
  return "site:linkedin.com \"#{name}\" \"#{title}\"" if name && title
  return "site:linkedin.com/ \"#{title}\" \"at #{company}\"" if title && company

  "site:linkedin.com \"groups and association\" + \"#{group}\"" if group
end
4

0 に答える 0