python - Python - 条件が満たされるまでスクリプトをループさせ、ループごとに異なるプロキシアドレスを使用する

Question

私はまさに初心者そのものです。Pythonについては何も知らないので、助けを求めています。自分の要望に合わせて変数を書き換えられる程度にはコードを読めますが、元のコードが想定していないことをさせようとすると... 途方に暮れてしまいます。

事情はこうです。まずすべてのCLサイトを検索し、特定のキーワードを含む投稿にフラグを立てる、Craigslist(CL)用のフラグ付けスクリプトを見つけました(もともとは、サイエントロジーに言及したすべての投稿にフラグを付けるために書かれたものです)。

自分の周辺エリアの CL サイトのみ (437 サイトではなく 15 サイト) を検索するように変更しましたが、特定のキーワード (こちらも変更済み) を検索する点は同じです。私は CL の投稿を選り分けて多くの取引をしているため、CL に絶えずスパムを投稿して選別を難しくしている人々に自動的にフラグを立てたいと考えています。

スクリプトにさせたいのは、条件に合う投稿が見つからなくなるまでループし、ループのたびにプロキシサーバーを切り替えることです。また、スクリプト内にプロキシの IP アドレス (複数可) を記入する場所も欲しいです。

返信をお待ちしております。

これが私が持っている変更されたコードです:

#!/usr/bin/env python
# -*- coding: utf-8 -*-


import urllib
from twill.commands import * # gives us go()

# Craigslist subdomains to search. Each name is listed exactly once so that
# no area is fetched twice and the printed area count matches the real number.
areas = [
    'sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
    'mendocino', 'modesto', 'monterey', 'redding', 'reno',
    'sacramento', 'siskiyou', 'stockton', 'yubasutter',
]

def expunge(url, area):
    """Flag every posting linked from the search-results page at ``url``.

    url  -- search-results URL to download and scan for posting links.
    area -- accepted from the caller but not used inside this function.

    Raises ValueError (from str.index) when the page has no '<hr>', or when
    a matching piece lacks '">' or '.html'.
    """
    html = urllib.urlopen(url).read()
    # Keep only the text from the first '<hr>' up to the next newline.
    results_line = html[html.index('<hr>'):].split('\n')[0]

    # Split on href=" and drop the first and last pieces; from the rest keep
    # the link target of every piece carrying the '<font size="-1">' marker.
    links = []
    for piece in results_line.split('href="')[1:-1]:
        if '<font size="-1">' in piece:
            links.append(piece[:piece.index('">')])

    for link in links:
        # Posting number: the text between the last '/' and '.html'.
        start = link.rfind('/') + 1
        end = link.index('.html')
        posting_id = link[start:end]
        flag_url = 'https://post.craigslist.org/flag?flagCode=15&amppostingID=' + posting_id
        go(flag_url)


print 'Checking ' + str(len(areas)) + ' areas...'

for area in ['http://' + a + '.craigslist.org/' for a in areas]:
    ujam = area + 'search/?query=james+"916+821+0590"+&catAbb=hhh'
    udre = area + 'search/?query="DRE+%23+01902542+"&catAbb=hhh'
    try:
        jam = urllib.urlopen(ujam).read()
        dre = urllib.urlopen(udre).read()
    except:
        print 'tl;dr error for ' + area

    if 'Found: ' in jam:
        print 'Found results for "James 916 821 0590" in ' + area
        expunge(ujam, area)
        print 'All "James 916 821 0590" listings marked as spam for area'

    if 'Found: ' in dre:
        print 'Found results for "DRE # 01902542" in ' + area
        expunge(udre, area)
        print 'All "DRE # 01902542" listings marked as spam for area'

score 0 · Accepted Answer

私はいくつかの変更を加えました...それらがどれだけうまく機能しているかはわかりませんが、エラーは発生していません。間違っている/足りないものを見つけたら、私に知らせてください。- ありがとう

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib, urllib2
from twill.commands import go


# Build one urllib2 opener per HTTPS proxy and install each in turn.
# urllib2.install_opener() sets the single process-wide default opener used
# by urllib2.urlopen(), so every call below replaces the opener installed by
# the previous one; once this section has run, opener5 is the default.
# NOTE(review): confirm that sending automated requests through these proxies
# is permitted by the target site's terms of use before running this.
proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)


    # Craigslist subdomains to search, followed by the search terms that are
    # inserted verbatim into each search URL.
    # NOTE(review): the leading indentation of this listing appears to have
    # been mangled when it was pasted; as shown it will not parse.
    areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
    'mendocino', 'modesto', 'monterey', 'redding', 'reno',
    'sacramento', 'siskiyou', 'stockton', 'yubasutter']
queries = ['james+"916+821+0590"','"DRE+%23+01902542"']

    # expunge(url, area): download the search-results page at `url`, pull the
    # posting links out of it, and issue flag requests for each posting.
    # `area` is accepted but not used in the body.
    # NOTE(review): the indentation below appears mangled by the paste; the
    # statements are presumably meant to form the function body and its loop.
    def expunge(url, area):
page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
# Keep only the text from the first '<hr>' up to the next newline.
page = page[page.index('<hr>'):].split('\n')[0]
# Split on href=", drop the first and last pieces, and keep the link target
# of every piece that contains the '<font size="-1">' marker.
page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
    num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
    # urllib2.urlopen() performs each request immediately and returns a
    # response object; three flag codes (15, 28, 16) are requested per posting.
    spam = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&amppostingID='+num )
    spam2 = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&amppostingID='+num )
    spam3 = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&amppostingID='+num )
    # NOTE(review): go() receives the response objects here, not URL strings
    # as in the original script -- confirm this is what twill's go() expects.
    go(spam) # flag it
    go(spam2) # flag it
    go(spam3) # flag it

# Main script body: for each area and each query, fetch the search page and
# call expunge() on it when the page reports results.
# NOTE(review): the indentation below appears mangled by the paste; the
# statements are presumably meant to nest as area loop > query loop > body.
print 'Checking ' + str(len(areas)) + ' areas...'

    for area in areas:
for query in queries:
    qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
    try:
        q = urllib.urlopen(qurl).read()
    except:
        # Bare except: any failure while fetching is reported and ends the
        # enclosing loop via break.
        print 'tl;dr error for {} in {}'.format(query, area)
        break

    # The results page contains 'Found: ' when the search matched something.
    if 'Found: ' in q:
        print 'Found results for {} in {}'.format(query, area)
        expunge(qurl, area)
        print 'All {} listings marked as spam for {}'.format(query, area)
        print ''
        print ''
    elif 'Nothing found for that search' in q:
        print 'No results for {} in {}'.format(query, area)
        print ''
        print ''
        break
    else:
        # Page matched neither marker text; stop without reporting.
        break

score 0 · Accepted Answer

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib, urllib2
from twill.commands import go


# Build one urllib2 opener per HTTPS proxy and install each in turn.
# urllib2.install_opener() sets the single process-wide default opener used
# by urllib2.urlopen(), so every call below replaces the opener installed by
# the previous one; once this section has run, opener5 is the default.
# NOTE(review): confirm that sending automated requests through these proxies
# is permitted by the target site's terms of use before running this.
proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)


# The single Craigslist subdomain this variant searches.
areas = ['capecod']
# Search terms; each one is inserted verbatim into a search URL.
queries = [
    'rent', 'rental', 'home', 'year',
    'falmouth', 'lease', 'credit', 'tenant',
    'apartment', 'bedroom', 'bed', 'bath',
]

    # expunge(url, area): download the search-results page at `url`, pull the
    # posting links out of it, and issue flag requests for each posting.
    # `area` is accepted but not used in the body.
    # NOTE(review): the indentation below appears mangled by the paste; the
    # statements are presumably meant to form the function body and its loop.
    def expunge(url, area):
page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
# Keep only the text from the first '<hr>' up to the next newline.
page = page[page.index('<hr>'):].split('\n')[0]
# Split on href=", drop the first and last pieces, and keep the link target
# of every piece that contains the '<font size="-1">' marker.
page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
    num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
    # urllib2.urlopen() performs each request immediately and returns a
    # response object; three flag codes (15, 28, 16) are requested per posting.
    spam = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&amppostingID='+num )
    spam2 = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&amppostingID='+num )
    spam3 = urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&amppostingID='+num )
    # NOTE(review): go() receives the response objects here, not URL strings
    # as in the original script -- confirm this is what twill's go() expects.
    go(spam) # flag it
    go(spam2) # flag it
    go(spam3) # flag it

# Main script body: for each area and each query, fetch the search page and
# call expunge() on it when the page reports results.
# NOTE(review): the indentation below appears mangled by the paste; the
# statements are presumably meant to nest as area loop > query loop > body.
print 'Checking ' + str(len(areas)) + ' areas...'

    for area in areas:
for query in queries:
    qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
    try:
        q = urllib.urlopen(qurl).read()
    except:
        # Bare except: any failure while fetching is reported and ends the
        # enclosing loop via break.
        print 'tl;dr error for {} in {}'.format(query, area)
        break

    # The results page contains 'Found: ' when the search matched something.
    if 'Found: ' in q:
        print 'Found results for {} in {}'.format(query, area)
        expunge(qurl, area)
        print 'All {} listings marked as spam for {}'.format(query, area)
        print ''
        print ''
    elif 'Nothing found for that search' in q:
        print 'No results for {} in {}'.format(query, area)
        print ''
        print ''
        break
    else:
        # Page matched neither marker text; stop without reporting.
        break

score 0 · Accepted Answer

次のような無限ループを作成できます。

# Loop indefinitely and leave the loop as soon as `condition` is true.
# `condition` is a placeholder for the caller's own exit test.
while True:
    if condition :
        break

itertools には、反復処理に使える便利な機能がいくつかあります: http://docs.python.org/2/library/itertools.html

特に、itertools.cycle を調べてみてください。

(これらは正しい方向を示すヒントのつもりです。どちらか一方、または両方を使って解決策を作成できます)

score 0 · Accepted Answer

コードにいくつかの変更を加えました。expunge 関数はすでにページ内のすべての結果をループしているように見えるので、どのようなループを作る必要があるのかはわかりません。ただ、最後の部分に、結果が見つかったかどうかを確認する方法の例を入れてあります (そこから抜け出すべきループはありませんが)。

プロキシ/IP の変更方法がわかりません。

ところで、'reno' が 2 回入っていました。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
from twill.commands import go

# Craigslist subdomains to search, written as one whitespace-separated string.
areas = (
    'sfbay chico fresno goldcountry humboldt '
    'mendocino modesto monterey redding reno '
    'sacramento siskiyou stockton yubasutter'
).split()
# Search terms; each one is inserted verbatim into a search URL.
queries = [
    'james+"916+821+0590"',
    '"DRE+%23+01902542"',
]

def expunge(url, area):
    """Flag every posting linked from the search-results page at ``url``.

    url  -- search-results URL to download and scan for posting links.
    area -- accepted from the caller but not used inside this function.

    Raises ValueError (from str.index) when the page has no '<hr>', or when
    a matching piece lacks '">' or '.html'.
    """
    body = urllib.urlopen(url).read()
    # Only the text between the first '<hr>' and the next newline is scanned.
    listing_row = body[body.index('<hr>'):].split('\n')[0]

    # Pieces after each href=" (first and last piece dropped); a piece is a
    # posting link when it carries the '<font size="-1">' marker.
    candidates = listing_row.split('href="')[1:-1]
    targets = []
    for candidate in candidates:
        if '<font size="-1">' in candidate:
            targets.append(candidate[:candidate.index('">')])

    for target in targets:
        # Posting number: the text between the last '/' and '.html'.
        begin = target.rfind('/') + 1
        finish = target.index('.html')
        number = target[begin:finish]
        go('https://post.craigslist.org/flag?flagCode=15&amppostingID=' + number)

print 'Checking ' + str(len(areas)) + ' areas...'

for area in areas:
    for query in queries:
        qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
        try:
            q = urllib.urlopen(qurl).read()
        except:
            print 'tl;dr error for {} in {}'.format(query, area)
            break

        if 'Found: ' in q:
            print 'Found results for {} in {}'.format(query, area)
            expunge(qurl, area)
            print 'All {} listings marked as spam for area'.format(query)
        elif 'Nothing found for that search' in q:
            print 'No results for {} in {}'.format(query, area)
            break
        else:
            break

python - Python - 条件が満たされるまでスクリプトをループさせ、ループごとに異なるプロキシ アドレスを使用する

回答 4 件

Related

Reference

python - Python - 条件が満たされるまでスクリプトをループさせ、ループごとに異なるプロキシアドレスを使用する