python - 複数のページのコンテンツをスクレイピングするための Python Selenium でのループ

Question

このサイトのすべての App.No をスクレイピングしています: WIPO。この Web サイトからコンテンツをスクレイピングしていますが、コードをクリックして次のページに移動しても、必要な最終ページ (100) をループできないようです。また、私が引き出すコンテンツは、最初のページのコンテンツの繰り返しです。私が経験した最も遠いループは、エラーメッセージがポップアップする前に 12 ページです。そしてなぜか、同じコードでも毎回違うページで止まるのでは？

StaleElementReferenceException: メッセージ: 古い要素参照: 要素がページドキュメントに添付されていません

これは、パスが見つからなくなったことを意味しますが、URL を変更してもウェブサイトにアクセスできませんが、別のページのパスを調べましたが、変更されていないため、移動方法がわかりません。前方。私のコードは次のようになります。誰かが助けることができれば？

class autoScraper():
def __init__(self,ep="./chromedriver",headless=False):
    options = webdriver.ChromeOptions()
    if headless: options.add_argument("--headless");
    options.add_argument("--start-maximized")
    self.driver= webdriver.Chrome(executable_path=ep,options=options);


def closeDriver(self):
    self.driver.close()
    
def next_page(self):
    # btn=self.driver.find_elements_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[2]/div/a/span')
    btn=self.driver.find_elements_by_css_selector('a[title="Next Page"]')

    if len(btn)>0:
        btn[0].click()
        
def connector(self,a="https://patentscope.wipo.int/search/en/search.jsf"):
    success = False;
    try:
        self.driver.get(a)
        self.driver.find_element_by_xpath('/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[2]/button').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]').click() # cilck to select the num of showing IPs on a page.
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[1]/div/select[1]/option[2]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[3]/div/select[1]/option[2]').click()
        success = True     
    except Exception as e:
        print(e)
    if success:
        return success
    
def getPCT(self):
    PCT = []
    for i in range(1,201):
        no = self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[1]/div/div/table/tbody/tr[%d]/td/div/div[2]/div/div[1]/span[2]/span[2]' %(i)).text
        PCT.append(no)
    
    return PCT   

def clickNextPage(self):
    self.driver.find_element_by_css_selector('a[title="Next Page"]').click()
    
if __name__ == '__main__':
   PCT=[]
   driver = autoScraper()
   if driver.connector():
       sleep(10)
       while i<100:
           i=i+1
           PCT=driver.getPCT()
           driver.clickNextPage()
       driver.next_page()
   print('The num of scraped PCTs:',len(PCT))
   try:
       os.system('mkdir ./download/')
   except:
       print('The directory is already existed.')
   finally:
       with open('./download/pct.txt','a') as f:
           for line in PCT:
               f.write(line+'\n')
       print('urls writen to ./download/pct.txt')
        

   driver.closeDriver()

python - 複数のページのコンテンツをスクレイピングするための Python Selenium でのループ

0 に答える 0

Related

Reference