python - Python: CSVの行の値に基づくフロー制御

Question

次の構造のCSVを使用しています。

"2012-09-01 20:03:15","http://example.com"

データは、私の閲覧履歴を整形したダンプです。日ごとに、最初に現れる5つの一意なドメインを数えたいと考えています。これまでに書いたコードは次のとおりです：

from urlparse import urlparse
import csv
from collections import Counter

# Tally of how often each hostname is seen inside the 06:00-09:00 window.
domains = Counter()

with open("history.csv") as f:
    for row in csv.reader(f):
        # Column 0 holds a "YYYY-MM-DD HH:MM:SS" timestamp, column 1 the URL.
        d = row[0]
        dd = d[0:10]                     # date part, e.g. "2012-09-01"
        dt = d[11:19].replace(":", "")   # time part as "HHMMSS", e.g. "200315"
        # Zero-padded "HHMMSS" strings order the same way the times do, so a
        # plain string comparison selects rows strictly between 06:00:00 and
        # 09:00:00.
        if (dt < "090000") and (dt > "060000"):
            ph = urlparse(row[1]).hostname
            # Single parenthesised argument: prints the same text whether
            # `print` is a statement or a function.
            print(dd + "," + dt + "," + ph)
            # Bump the tally in place instead of building (and adding) a
            # throwaway one-element Counter for every matching row.
            domains[ph] += 1
# Text form of the 20 most frequently seen hostnames.
t = str(domains.most_common(20))

d、dt、ddを使って日付と時刻を分けています。上記の行の例では、dt = 200315（20:03:15 からコロンを取り除いたもの）、dd = 2012-09-01 になります。「if (dt < "090000") and (dt > "060000")」は、午前6時から午前9時の間にアクセスしたWebサイトだけを数えたい、という意味です。では、「毎日午前6時より前にアクセスした最初の5つのWebサイトのみを数える」という条件は、どのように書けばよいでしょうか？1日あたり数百行あり、行は時系列順に並んでいます。

score 3 · Accepted Answer

特定の 1 日あたりの最初の 5 つの一意のドメインをカウントすることに関心があります。

import csv
from collections import defaultdict
from datetime import datetime
from urlparse import urlsplit

# Maps each calendar date to a {hostname: visit count} table that only ever
# contains the first five distinct hostnames seen on that date between
# 06:00 and 09:00.
domains = defaultdict(lambda: defaultdict(int))
with open("history.csv", "rb") as history:
    for stamp, link in csv.reader(history):
        visited = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        # Ignore anything outside the 6am (inclusive) to 9am (exclusive) window.
        if not 6 <= visited.hour < 9:
            continue
        per_day = domains[visited.date()]
        host = urlsplit(link).hostname
        # Once five distinct hostnames are recorded for the day, a new
        # hostname is ignored; already-recorded ones keep being counted.
        if host not in per_day and len(per_day) >= 5:
            continue
        per_day[host] += 1

print(domains)

score 1 · Accepted Answer

import csv
from collections import defaultdict, Counter
from datetime import datetime
from urlparse import urlsplit

# For each hostname: on how many days it was among that day's first five
# distinct hostnames seen between 06:00 and 11:00.
indiv = Counter()

# Per-date record of the hostnames already taken into account that day.
domains = defaultdict(lambda: defaultdict(int))
with open("history.csv", "rb") as f:
    for timestr, url in csv.reader(f):
        dt = datetime.strptime(timestr, "%Y-%m-%d %H:%M:%S")
        if 6 <= dt.hour < 11:  # between 6am and 11am
            today_domains = domains[dt.date()]
            domain = urlsplit(url).hostname
            # Only the first sighting of a hostname on a given day counts,
            # and only while fewer than five hostnames are recorded for it.
            if len(today_domains) < 5 and domain not in today_domains:
                today_domains[domain] += 1
                # Increment in place rather than adding a freshly built
                # one-element Counter for every hit.
                indiv[domain] += 1
for domain in indiv:
    # Single parenthesised argument: prints the same text whether `print`
    # is a statement or a function.
    print('%s,%d' % (domain, indiv[domain]))

python - Python: CSVの行の値に基づくフロー制御

回答 2 件

Related

Reference