スクレイプとセレンを使用するかなり巨大で機能的なコードベースを確認したい場合は、https://github.com/nicodjimenez/bus_catchersを確認してください。これはもっと簡単な例です。
# stripped down BoltBus script
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Response
from scrapy.http import TextResponse
import time
# set dates, origin, destination
cityOrigin="Baltimore"
cityDeparture="New York"
day_array=[0]
browser = webdriver.Firefox()
# we are going the day of the days of the month from 15,16,...,25
# there is a discrepancy between the index of the calendar days and the day itself: for example day[10] may correspond to Feb 7th
for day in day_array:
# Create a new instance of the Firefox driver
browser.get("http://www.boltbus.com")
# click on "region" tab
elem_0=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstRegion_textBox")
elem_0.click()
time.sleep(5)
# select Northeast
elem_1=browser.find_element_by_partial_link_text("Northeast")
elem_1.click()
time.sleep(5)
# click on origin city
elem_2=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstOrigin_textBox")
elem_2.click()
time.sleep(5)
# select origin city
elem_3=browser.find_element_by_partial_link_text(cityOrigin)
elem_3.click()
time.sleep(5)
# click on destination city
elem_4=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstDestination_textBox")
elem_4.click()
time.sleep(5)
# select destination city
elem_5=browser.find_element_by_partial_link_text(cityDeparture)
elem_5.click()
time.sleep(5)
# click on travel date
travel_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_imageE")
travel_date_elem.click()
# gets day rows of table
date_rows=browser.find_elements_by_class_name("daysrow")
# select actual day (use variable day)
# NOTE: you must make sure these day elements are "clickable"
days=date_rows[0].find_elements_by_xpath("..//td")
days[day].click()
time.sleep(3)
# retrieve actual departure date from browser
depart_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_txtDepartureDate")
depart_date=str(depart_date_elem.get_attribute("value"))
# PARSE TABLE
# convert html to "nice format"
text_html=browser.page_source.encode('utf-8')
html_str=str(text_html)
# this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
resp_for_scrapy=TextResponse('none',200,{},html_str,[],None)
# takes a "TextResponse" object and feeds it to a scrapy function which will convert the raw HTML to a XPath document tree
hxs=HtmlXPathSelector(resp_for_scrapy)
# the | sign means "or"
table_rows=hxs.select('//tr[@class="fareviewrow"] | //tr[@class="fareviewaltrow"]')
row_ct=len(table_rows)
for x in xrange(row_ct):
cur_node_elements=table_rows[x]
travel_price=cur_node_elements.select('.//td[@class="faresColumn0"]/text()').re("\d{1,3}\.\d\d")
# I use a mixture of xpath selectors to get me to the right location in the document, and regular expressions to get the exact data
# actual digits of time
depart_time_num=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("\d{1,2}\:\d\d")
# AM or PM (time signature)
depart_time_sig=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("[AP][M]")
# actual digits of time
arrive_time_num=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("\d{1,2}\:\d\d")
# AM or PM (time signature)
arrive_time_sig=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("[AP][M]")
print "Depart date: " + depart_date
print "Depart time: " + depart_time_num[0] + " " + depart_time_sig[0]
print "Arrive time: " + arrive_time_num[0] + " " + arrive_time_sig[0]
print "Cost: " + "$" + travel_price[0]
print "\n"