0

語彙の多様性と他のいくつかの意味のある統計を計算するスクリプトを作成しました。私の問題は、いくつかのファイルでは、それが悪いjson行であると私が推測できるものにぶつかると失敗することです。私の各データファイルには、それぞれ独自の行にjsonの文字列が含まれています。これらの計算で意味のあるフィールドは「テキスト」フィールドです。

私のコード:

import fileinput
import json
import sys
import os
from collections import defaultdict

line = []                                                       # set to list
tw = 0                                                          # set total words to int
tuw = 0                                                         # set total unique words to int
lexd = 0                                                        # set total lexical diversity to int
awpt = 0                                                        # set average words per tweet to int
line_counter = 0

inputfilename = sys.argv[1]                                     # read the first system argument as the input file name

word_count = defaultdict(int)                                   # set word_count to the default dictionary

for line in fileinput.input([inputfilename]):                   # FOR each line in the input file
        line = line.strip();                                            # strip any blank lines and throw them out
        if not line: continue                                           # if the file does contain a blank line still: in the case of EOF then continue
        tweettext = json.loads(line).get('text')                        # load the line with json.loads and get the "text" field
        if not json.loads(line).get('text'): continue                   # if the line does not contain json data then continue
        words = tweettext.split()                                       # split the words from the single line into individual dicts
        tw += len(words)                                                # total words counter
        line_counter += 1                                               # total lines counter
        print line_counter                                              # so we know what line we're on
        for word in words:                                              # FOR each word in the individual line "text" corpus
                word_count[word]+=1                                             # Take the word_count dict, insert the words and incriment

tuw = len(set(word_count))                                      # calculate the total number of unique words
lexd += 1.0*tuw/tw                                              # calculate the lexical diversity
awpt = 1.0*tuw/line_counter                                     # calc average number of words per tweet

print word_count                                                # print the word list dictionary
print "total number of words", tw                               # print the total number of words
print "total uniq words", tuw                                   # print the total number of unique words
print "total corpus lexical diversity", lexd                    # print the total lexical diversity of the entire corpus
print "average number of words per tweet", awpt                 # print the average number of words per tweet

サンプルデータ:

{"favorited": false, "in_reply_to_user_id": 213741147, "contributors": null, "truncated": false, "text": "@Rafinha_Angelo sim sim, manda o print l\u00e1 HUSAHUS!", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": "169216950453542912", "coordinates": null, "in_reply_to_user_id_str": "213741147", "entities": {"user_mentions": [{"indices": [0, 15], "screen_name": "Rafinha_Angelo", "id": 213741147, "name": "Rafael A. Figueiredo", "id_str": "213741147"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": 169216950453542912, "id_str": "169217034821976067", "in_reply_to_screen_name": "Rafinha_Angelo", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme9/bg.gif", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1769152407/223_normal.JPG", "profile_sidebar_fill_color": "252429", "is_translator": false, "id": 67115876, "profile_text_color": "666666", "followers_count": 310, "profile_sidebar_border_color": "181A1E", "location": "Somewhere.", "default_profile_image": false, "listed_count": 0, "utc_offset": -10800, "statuses_count": 6027, "description": "it's like one more day, with no more things !", "friends_count": 106, "profile_link_color": "2FC2EF", "profile_image_url": "http://a2.twimg.com/profile_images/1769152407/223_normal.JPG", "notifications": null, "show_all_inline_media": false, "geo_enabled": true, "profile_background_color": "1A1B1F", "id_str": "67115876", "profile_background_image_url": "http://a1.twimg.com/images/themes/theme9/bg.gif", "screen_name": "Guiii_Fernandes", "lang": "en", "profile_background_tile": false, "favourites_count": 112, "name": "Guilherme Fernandes", "url": "http://facebook.com/GuiiFernandes", "created_at": "Wed Aug 19 20:43:05 +0000 2009", "contributors_enabled": false, "time_zone": "Brasilia", "protected": false, "default_profile": false, 
"following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 169217034821976067, "source": "web"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "retweeted_status": {"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Borat voice). Xoxo, JM", "created_at": "Mon Feb 13 23:27:08 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169200965151494144", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 69751644, "description": "", "verified": true, "profile_image_url_https": "https://si0.twimg.com/profile_images/387138234/1_normal.jpg", "profile_sidebar_fill_color": "5c5c5c", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 473162, "profile_sidebar_border_color": "00e35f", "id_str": "69751644", "default_profile_image": false, "location": "Los Angeles", "utc_offset": -28800, "statuses_count": 5380, "profile_background_color": "00e35f", "friends_count": 10730, "profile_link_color": "05bcff", "profile_image_url": "http://a0.twimg.com/profile_images/387138234/1_normal.jpg", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/72720138/green.jpg", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/72720138/green.jpg", "screen_name": "jamesmaslow", "lang": "en", "profile_background_tile": false, "favourites_count": 1, "name": "james maslow", "url": "http://www.JamesMaslow.com", "created_at": "Sat Aug 29 01:32:02 +0000 2009", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "following": null, "listed_count": 8348}, 
"in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169200965151494144, "source": "<a href=\"http://www.osfoora.com\" rel=\"nofollow\">Osfoora for iPhone</a>"}, "truncated": true, "text": "RT @jamesmaslow: On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Bora ...", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [3, 15], "id_str": "69751644", "id": 69751644, "name": "james maslow", "screen_name": "jamesmaslow"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169217034817765377", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 466873377, "description": "Totally dedicate for @1LoganHenderson MINE perfect BTBoy!!!! *--* Rusher for the infinity and beyond and much more beyond!!! Since 01/17/12 =*", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "profile_sidebar_fill_color": "940a2d", "is_translator": false, "geo_enabled": false, "profile_text_color": "eb4466", "followers_count": 103, "profile_sidebar_border_color": "d61153", "id_str": "466873377", "default_profile_image": false, "location": "", "utc_offset": -7200, "statuses_count": 3730, "profile_background_color": "070808", "friends_count": 154, "profile_link_color": "de243d", "profile_image_url": "http://a2.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "profile_background_image_url": "http://a3.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "screen_name": "Logiehbear", "lang": "en", 
"profile_background_tile": true, "favourites_count": 209, "name": "BBFFF da Laryh!!", "url": null, "created_at": "Tue Jan 17 21:53:17 +0000 2012", "contributors_enabled": false, "time_zone": "Mid-Atlantic", "protected": false, "default_profile": false, "following": null, "listed_count": 1}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169217034817765377, "source": "web"}

スクリプト出力:

1
2
defaultdict(<type 'int'>, {u'be': 1, u'is': 1, u'Going': 1, u'in': 2, u'I': 1, u'(said': 1, u'RT': 1, u'huge': 1, u'for': 1, u'l\xe1': 1, u'few': 1, u'Vegas': 1, u'manda': 1, u'print': 1, u'sim,': 1, u'sim': 1, u'On': 1, u'to': 1, u'like!': 1, u'HUSAHUS!': 1, u'rehearsal...this': 1, u'@jamesmaslow:': 1, u'...': 1, u'epic!': 1, u'stage': 1, u'a': 1, u'show.': 1, u'last': 1, u'of': 1, u'days': 1, u'o': 1, u'@Rafinha_Angelo': 1, u'the': 2, u'Bora': 1})
total number of words 36
total uniq words 34
total corpus lexical diversity 0.944444444444
average number of words per tweet 17.0

これは実際には非常に高速に実行されますが、数千行後の一部のデータセットでは、次のように失敗します。

Traceback (most recent call last):
  File "lex.py", line 21, in <module>
    tweettext = json.loads(line).get('text')                        # load the line with json.loads and get the "text" field
  File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
    return _default_decoder.decode(s)
  File "/usr/lib64/python2.7/json/decoder.py", line 366, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib64/python2.7/json/decoder.py", line 382, in raw_decode
    obj, end = self.scan_once(s, idx)
ValueError: Unterminated string starting at: line 1 column 1531 (char 1531)

スクリプトが読み取り中の行の形式で止まってしまう場合には、その行をスキップして次の行に進むようにしたいのです。何かアドバイスはありますか？

4

1 に答える 1

1

json.loads がエラーを出す行を例外処理で捕捉するという Jesse Harris の提案を使って、この問題を解決しました。

import fileinput
import json
import sys
import os
from collections import defaultdict

line = []                                                     
tw = 0                                                        
tuw = 0                                                        
lexd = 0                                                       
awpt = 0                                                       
line_counter = 0

inputfilename = sys.argv[1]                                   

word_count = defaultdict(int)                                 

for line in fileinput.input([inputfilename]):                  
        line = line.strip();                                            
        if not line: continue   
        try:         
               tweettext = json.loads(line).get('text')                       
               if not json.loads(line).get('text'): continue                   
               words = tweettext.split()                                       
               tw += len(words)                                                
               line_counter += 1                                               
               print line_counter                                              
               for word in words:                                              
                      word_count[word]+=1                                             
        except:
               print "Problem Line: " + line

tuw = len(set(word_count))                                      
lexd += 1.0*tuw/tw                                             
awpt = 1.0*tuw/line_counter                                     

# print word_count                                               
print "total number of words", tw                              
print "total uniq words", tuw                                  
print "total corpus lexical diversity", lexd                   
print "average number of words per tweet", awpt                

これを自分のデータに対して実行したところ、問題の行として gzip 圧縮されたデータが出力されました。以前のコメントで述べたとおり、Twitter の gzip ストリーミング API に切り替えたことが原因でした。@jesseharris に感謝します！

于 2012-06-03T05:52:05.747 に答える