python - Python と mechanize でドロップボックスイベントページを解析しようとすると 403 エラーが発生する

Question

このスクリプトを使用して、特定のディレクトリに対するすべてのファイル更新のリストを取得します。次に、そのリストを解析して、そのディレクトリでアクティブだったタイムスロットのリストを取得します。そうすれば、プロジェクトに費やした時間をすばやく確認し、クライアントに請求する金額を知ることができます.

次のリポジトリのコードを基にして、小さな Python スクリプトを作成しました: https://github.com/jncraton/PythonDropboxUploader

https://www.dropbox.com/events?ns=false&n=50 から特定のイベントページを取得するために、クラスの一番下にある関数 (getEventsPage) を追加しました。

2 か月前にこのスクリプトを使ったときは問題なく動作していましたが、現在は次の行で 403 Forbidden エラーが発生します:

eventSrc = self.browser.open(req).read()

おそらく、DropBox は私のようなスクレイパーをブロックして、代わりにプログラマーに API を使用するよう促そうとしますが、残念ながら API はイベントの一覧表示をサポートしていません。

誰かがそれを再び機能させるために私を助けることができますか?

これは、接続を作成するための Python コードです。

import mechanize
import urllib
import re
import json

class DropboxConnection:
    """Scrape-based session against the Dropbox web site, driven by mechanize.

    Constructing an instance logs in immediately and then reads the
    ``root_ns`` and ``TOKEN`` page constants that the AJAX-style requests
    in this class send along.
    """

    # Credentials supplied to the constructor.
    email = ""
    password = ""
    # Constants extracted from the /home page by get_constants().
    root_ns = ""
    token = ""
    # mechanize.Browser instance; None until login() succeeds.
    browser = None

    def __init__(self, email, password):
        """Store the credentials, log in and load the page constants.

        Raises:
            Exception: propagated from login() or get_constants().
        """
        self.email = email
        self.password = password

        self.login()
        self.get_constants()

    def login(self):
        """Open the login page, fill in the login form and submit it.

        On success ``self.browser`` holds the mechanize browser used by
        every other method.

        Raises:
            Exception: if the login form cannot be selected; in that case
                ``self.browser`` is reset to None.
        """
        # Fire up a browser using mechanize
        self.browser = mechanize.Browser()

        self.browser.set_handle_equiv(False)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        self.browser.set_handle_robots(False)

        self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]

        # Browse to the login page
        self.browser.open('https://www.dropbox.com/login')

        # The login form is the one that POSTs back to /login
        def is_login_form(form):
            return form.action == "https://www.dropbox.com/login" and form.method == "POST"

        try:
            self.browser.select_form(predicate=is_login_form)
        except Exception:
            # Narrower than a bare except: lets KeyboardInterrupt/SystemExit through.
            self.browser = None
            raise Exception('Unable to find login form')

        self.browser['login_email'] = self.email
        self.browser['login_password'] = self.password
        self.browser['t'] = "1230"

        # Send the form
        self.browser.submit()

    def get_constants(self):
        """Read ``root_ns`` and ``TOKEN`` from the /home page source.

        The token is accepted in either single or double quotes, because
        the page has been observed to switch between the two.

        Raises:
            Exception: if either constant is missing from the page.
        """
        home_src = self.browser.open('https://www.dropbox.com/home').read()

        root_match = re.search(r"root_ns: (\d+)", home_src)
        # Group 1 is the opening quote; \1 requires the same closing quote.
        # Non-greedy so other quoted values on the same line are not swallowed.
        token_match = re.search(r"""TOKEN: (['"])(.+?)\1""", home_src)

        if root_match is None or token_match is None:
            raise Exception("Unable to find constants for AJAX requests")

        self.root_ns = root_match.group(1)
        self.token = token_match.group(2)

    def upload_file(self, local_file, remote_dir, remote_file):
        """Upload ``local_file`` into ``remote_dir`` under the name ``remote_file``.

        Raises:
            Exception: if not logged in or the upload form cannot be selected.
        """
        if not self.is_logged_in():
            raise Exception("Can't upload when not logged in")

        self.browser.open('https://www.dropbox.com/')

        # The upload form is the one that POSTs to the dl-web upload endpoint
        def is_upload_form(form):
            return form.action == "https://dl-web.dropbox.com/upload" and form.method == "POST"

        try:
            self.browser.select_form(predicate=is_upload_form)
        except Exception:
            raise Exception('Unable to find upload form')

        self.browser.form.find_control("dest").readonly = False
        self.browser.form.set_value(remote_dir, "dest")

        # Keep the file open until the form is submitted, then always close it.
        with open(local_file, "rb") as upload_handle:
            self.browser.form.add_file(upload_handle, "", remote_file)
            # Submit the form with the file
            self.browser.submit()

    def get_dir_list(self, remote_dir):
        """Return a dict mapping file name to file URL for ``remote_dir``.

        Entries flagged as directories are left out.

        Raises:
            Exception: if not logged in.
        """
        if not self.is_logged_in():
            raise Exception("Can't download when not logged in")

        req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token

        # mechanize.Request (as in getEventsPage) instead of urllib2.Request:
        # urllib2 is not imported by this module.
        req = mechanize.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
        req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)

        dir_info = json.loads(self.browser.open(req).read())

        dir_list = {}

        for item in dir_info['file_info']:
            # Eliminate directories (first field is the directory flag)
            if item[0]:
                continue

            # Local filename is whatever follows the last slash of the path
            absolute_filename = item[3]
            local_filename = absolute_filename.rsplit("/", 1)[-1]

            # Map the filename to its URL
            dir_list[local_filename] = item[8]

        return dir_list

    def get_download_url(self, remote_dir, remote_file):
        """Return the URL of ``remote_file`` inside ``remote_dir``.

        Raises:
            KeyError: if the directory listing has no such file.
        """
        return self.get_dir_list(remote_dir)[remote_file]

    def download_file(self, remote_dir, remote_file, local_file):
        """Download ``remote_file`` from ``remote_dir`` and save it as ``local_file``."""
        # Fetch first so a failed request does not leave an empty local file.
        content = self.browser.open(self.get_download_url(remote_dir, remote_file)).read()

        with open(local_file, "wb") as out_handle:
            out_handle.write(content)

    def is_logged_in(self):
        """Return True when a browser session exists, False otherwise."""
        return bool(self.browser)

    def getEventsPage(self, n):
        """Fetch page ``n`` of the events listing and return the raw response body.

        The values ``cur_page=n`` and ``ns_id=false`` are form-encoded and
        sent as the request data to /next_events.

        Raises:
            Exception: if not logged in.
        """
        if not self.is_logged_in():
            raise Exception("Can't get event page when not logged in")

        url = 'https://www.dropbox.com/next_events'
        values = {'cur_page': n, 'ns_id': 'false'}
        data = urllib.urlencode(values)
        req = mechanize.Request(url, data)

        return self.browser.open(req).read()

そして、これはイベントページを解析するループです。

from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc
# Driver script: log in to Dropbox, walk the events pages from 250 down to 0
# and record every row that links to a file under the project folder.
c = pdc.Constants()
p = pdt.Calendar(c)

email = "myemail@gmail.com"  # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")

try:
    # Create the connection
    conn = DropboxConnection(email, password)
except Exception:
    print("Connection failed")
    # Without a connection there is nothing to scrape; stop here instead of
    # failing later with a NameError on `conn`.
    raise SystemExit(1)
else:
    print("Connection succesful")

# Compiled once; only links into the project folder are of interest.
project_link = re.compile('^https://dl-web.dropbox.com/get/ProjectName')

found = 0
# The with-block guarantees the output file is flushed and closed.
with open('all_file_updates.txt', "wb") as dateFile:
    for n in range(250, -1, -1):
        eventsPageSrc = conn.getEventsPage(n)
        soup = BeautifulSoup(eventsPageSrc)

        table = soup.find("table", {"id": "events"})
        # A page without an events table contributes no rows.
        rows = table.find_all('tr') if table is not None else []

        for row in rows:
            link = row.find("a", href=project_link)
            if link is None:
                continue

            modified_cell = row.find("td", attrs={'class': 'modified'})
            if modified_cell is None:
                # No modification date on this row, so nothing to record.
                continue

            date = p.parse(modified_cell.string)
            line = 'Date: ' + str(date) + '    file: ' + link.get_text() + '\n'
            # The file is opened in binary mode, so encode explicitly; this
            # also keeps non-ASCII file names from raising on write.
            dateFile.write(line.encode('utf-8'))
            found = found + 1

        # Report the page that was just processed.
        print('page: ' + str(n) + ' Total found: ' + str(found))

score 0 · Accepted Answer

def get_constants(self): の中の次の行を変更してください:

self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]

に

self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]

ドロップボックスは定数の保存方法を変更しました

それが役に立てば幸い。

python - Python と mechanize でドロップボックス イベント ページを解析しようとすると 403 エラーが発生する

1 に答える 1

Related

Reference

python - Python と mechanize でドロップボックスイベントページを解析しようとすると 403 エラーが発生する