import imdb
# Create the IMDb access object, fetch the movie by its IMDb ID and ask the
# library to add the 'business' info set to the movie object.
ia = imdb.IMDb()
avatar = ia.get_movie("0120667")
ia.update(avatar, 'business')
# Call print() as a function: with a single argument this prints the same
# text on Python 2 and, unlike the print statement, also runs on Python 3.
print(avatar['business'])
これにより、興行収入と各国での上映情報の全リストが返されます。しかし、上映情報だけを、しかも 1 か国分だけ取得するにはどうすればよいでしょうか? この例で取得したい情報は (USA) (10 July 2005) (3,602 Screens) です。
import imdb
import re
# Create the IMDb access object and fetch the movie by its IMDb ID.
ia = imdb.IMDb()
avatar = ia.get_movie("0120667")
# Add the 'business' info set to the movie object.
ia.update(avatar, 'business')
# Opening-weekend entries; each one is handed to parseWeekendGross() by the
# loop further down. Presumably a list of strings shaped like
# "<gross> (<country>) (<date>) (<n> Screens)" -- confirm against live data.
opening_weekends = avatar['business']['opening weekend']
def parseDate(date):
    """Extract year, month and day from a free-form date string.

    Handles strings such as "10 July 2005", "July 2005" or "2005".

    Args:
        date: Date text with an optional leading day number, an English
            month name (any letter case) and a trailing four-digit year.

    Returns:
        A dict holding only the parts that were found: 'year' (the four
        trailing digits), 'month' (lower-cased English month name) and
        'day' (one or two leading digits). All values are strings; the
        dict is empty when nothing is recognised.
    """
    result = {}
    # Capture the year rather than slicing date[-4:], so a trailing newline
    # (which "$" tolerates) cannot leak into the value.
    yearmatch = re.match(r".*(?P<year>\d{4})$", date)
    if yearmatch:
        result['year'] = yearmatch.group('year')
    m = re.match(r".*(?P<month>January|February|March|April|May|June|July|"
                 r"August|September|October|November|December).*", date, re.I)
    if m:
        result['month'] = m.group('month').lower()
        # Only look for a day once a month is known; otherwise the first
        # two digits of a bare year ("2005") would be taken for a day.
        daymatch = re.match(r"^(?P<day>\d{1,2}).*", date)
        if daymatch:
            result['day'] = daymatch.group('day')
    return result
def parseBudget(amount):
    """Split a money string such as "$77,211,321" into digits and currency.

    assumptions:
    - currency is always before the number
    - no fractions

    Args:
        amount: Text made of a currency marker followed by a number that
            may contain thousands separators.

    Returns:
        An ``(amount, currency)`` tuple of strings: the number with every
        non-digit character removed, and the stripped text that precedes
        the first digit. When the text holds no digit at all the amount is
        "" and the whole stripped text is returned as the currency.
    """
    # Locate the first ASCII digit; everything before it is the currency.
    first_digit = re.search(r"[0-9]", amount)
    if first_digit is None:
        # Nothing numeric to parse: report an empty amount instead of
        # failing on an index that was never assigned.
        return "", amount.strip()
    amount_idx = first_digit.start()
    currency = amount[:amount_idx].strip()
    digits = re.sub(r"\D", "", amount[amount_idx:])
    return digits, currency
def parseWeekendGross(gross_text):
    """Parse one opening-weekend entry into its individual fields.

    The entry is split on " (" and must yield exactly four parts, in the
    order gross amount, country, date and screen count, e.g.
    "$1,000 (USA) (10 July 2005) (3,602 Screens)".

    Args:
        gross_text: The opening-weekend entry text.

    Returns:
        "" when the entry does not split into exactly four parts; otherwise
        the tuple ``(amount, currency, country, day, month, year, screens)``
        of strings. Date parts that parseDate() could not find are "".
        ``screens`` is the digits of the last part, or the literal two-quote
        string "''" when that part holds no digit.
    """
    g = gross_text.split(' (')
    if len(g) != 4:
        return ""
    amount, currency = parseBudget(g[0])
    country = g[1].lstrip('(').rstrip(')')
    date = parseDate(g[2].lstrip('(').rstrip(')'))
    # parseDate() only sets the keys it recognised, so read them with a
    # default instead of indexing; a partial date must not raise KeyError.
    day = date.get('day', '')
    month = date.get('month', '')
    year = date.get('year', '')
    screens = re.sub(r"\D", "", g[3])
    if not screens:
        # Placeholder kept exactly as in the original code.
        screens = "''"
    return amount, currency, country, day, month, year, screens
# Print country, date and screen count of the first USA opening-weekend
# entry, then stop.
for entry in opening_weekends:
    parsed = parseWeekendGross(entry)
    if not parsed:
        # parseWeekendGross() returns "" for entries that do not have the
        # four expected parts; unpacking that would raise ValueError.
        continue
    amount, currency, country, day, month, year, screens = parsed
    if country == "USA":
        print("Country: %s" % country)
        print("Date: %s %s %s" % (day, month, year))
        print("Screens: %s" % screens)
        break
上記のコードにより、次の結果が得られます。
Country: USA
Date: 10 july 2005
Screens: 3602
データを解析する関数は、次のプロジェクトからコピーしたものです: pyIRDG