語彙の多様性と他のいくつかの意味のある統計を計算するスクリプトを作成しました。私の問題は、いくつかのファイルでは、それが悪いjson行であると私が推測できるものにぶつかると失敗することです。私の各データファイルには、それぞれ独自の行にjsonの文字列が含まれています。これらの計算で意味のあるフィールドは「テキスト」フィールドです。
私のコード:
import fileinput
import json
import sys
import os
from collections import defaultdict
line = [] # set to list
tw = 0 # set total words to int
tuw = 0 # set total unique words to int
lexd = 0 # set total lexical diversity to int
awpt = 0 # set average words per tweet to int
line_counter = 0
inputfilename = sys.argv[1] # read the first system argument as the input file name
word_count = defaultdict(int) # set word_count to the default dictionary
for line in fileinput.input([inputfilename]): # FOR each line in the input file
line = line.strip(); # strip any blank lines and throw them out
if not line: continue # if the file does contain a blank line still: in the case of EOF then continue
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
if not json.loads(line).get('text'): continue # if the line does not contain json data then continue
words = tweettext.split() # split the words from the single line into individual dicts
tw += len(words) # total words counter
line_counter += 1 # total lines counter
print line_counter # so we know what line we're on
for word in words: # FOR each word in the individual line "text" corpus
word_count[word]+=1 # Take the word_count dict, insert the words and incriment
tuw = len(set(word_count)) # calculate the total number of unique words
lexd += 1.0*tuw/tw # calculate the lexical diversity
awpt = 1.0*tuw/line_counter # calc average number of words per tweet
print word_count # print the word list dictionary
print "total number of words", tw # print the total number of words
print "total uniq words", tuw # print the total number of unique words
print "total corpus lexical diversity", lexd # print the total lexical diversity of the entire corpus
print "average number of words per tweet", awpt # print the average number of words per tweet
サンプルデータ:
{"favorited": false, "in_reply_to_user_id": 213741147, "contributors": null, "truncated": false, "text": "@Rafinha_Angelo sim sim, manda o print l\u00e1 HUSAHUS!", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": "169216950453542912", "coordinates": null, "in_reply_to_user_id_str": "213741147", "entities": {"user_mentions": [{"indices": [0, 15], "screen_name": "Rafinha_Angelo", "id": 213741147, "name": "Rafael A. Figueiredo", "id_str": "213741147"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": 169216950453542912, "id_str": "169217034821976067", "in_reply_to_screen_name": "Rafinha_Angelo", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme9/bg.gif", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1769152407/223_normal.JPG", "profile_sidebar_fill_color": "252429", "is_translator": false, "id": 67115876, "profile_text_color": "666666", "followers_count": 310, "profile_sidebar_border_color": "181A1E", "location": "Somewhere.", "default_profile_image": false, "listed_count": 0, "utc_offset": -10800, "statuses_count": 6027, "description": "it's like one more day, with no more things !", "friends_count": 106, "profile_link_color": "2FC2EF", "profile_image_url": "http://a2.twimg.com/profile_images/1769152407/223_normal.JPG", "notifications": null, "show_all_inline_media": false, "geo_enabled": true, "profile_background_color": "1A1B1F", "id_str": "67115876", "profile_background_image_url": "http://a1.twimg.com/images/themes/theme9/bg.gif", "screen_name": "Guiii_Fernandes", "lang": "en", "profile_background_tile": false, "favourites_count": 112, "name": "Guilherme Fernandes", "url": "http://facebook.com/GuiiFernandes", "created_at": "Wed Aug 19 20:43:05 +0000 2009", "contributors_enabled": false, "time_zone": "Brasilia", "protected": false, "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 169217034821976067, "source": "web"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "retweeted_status": {"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Borat voice). Xoxo, JM", "created_at": "Mon Feb 13 23:27:08 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169200965151494144", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 69751644, "description": "", "verified": true, "profile_image_url_https": "https://si0.twimg.com/profile_images/387138234/1_normal.jpg", "profile_sidebar_fill_color": "5c5c5c", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 473162, "profile_sidebar_border_color": "00e35f", "id_str": "69751644", "default_profile_image": false, "location": "Los Angeles", "utc_offset": -28800, "statuses_count": 5380, "profile_background_color": "00e35f", "friends_count": 10730, "profile_link_color": "05bcff", "profile_image_url": "http://a0.twimg.com/profile_images/387138234/1_normal.jpg", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/72720138/green.jpg", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/72720138/green.jpg", "screen_name": "jamesmaslow", "lang": "en", "profile_background_tile": false, "favourites_count": 1, "name": "james maslow", "url": "http://www.JamesMaslow.com", "created_at": "Sat Aug 29 01:32:02 +0000 2009", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "following": null, "listed_count": 8348}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169200965151494144, "source": "<a href=\"http://www.osfoora.com\" rel=\"nofollow\">Osfoora for iPhone</a>"}, "truncated": true, "text": "RT @jamesmaslow: On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Bora ...", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [3, 15], "id_str": "69751644", "id": 69751644, "name": "james maslow", "screen_name": "jamesmaslow"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169217034817765377", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 466873377, "description": "Totally dedicate for @1LoganHenderson MINE perfect BTBoy!!!! *--* Rusher for the infinity and beyond and much more beyond!!! Since 01/17/12 =*", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "profile_sidebar_fill_color": "940a2d", "is_translator": false, "geo_enabled": false, "profile_text_color": "eb4466", "followers_count": 103, "profile_sidebar_border_color": "d61153", "id_str": "466873377", "default_profile_image": false, "location": "", "utc_offset": -7200, "statuses_count": 3730, "profile_background_color": "070808", "friends_count": 154, "profile_link_color": "de243d", "profile_image_url": "http://a2.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "profile_background_image_url": "http://a3.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "screen_name": "Logiehbear", "lang": "en", "profile_background_tile": true, "favourites_count": 209, "name": "BBFFF da Laryh!!", "url": null, "created_at": "Tue Jan 17 21:53:17 +0000 2012", "contributors_enabled": false, "time_zone": "Mid-Atlantic", "protected": false, "default_profile": false, "following": null, "listed_count": 1}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169217034817765377, "source": "web"}
スクリプト出力:
1
2
defaultdict(<type 'int'>, {u'be': 1, u'is': 1, u'Going': 1, u'in': 2, u'I': 1, u'(said': 1, u'RT': 1, u'huge': 1, u'for': 1, u'l\xe1': 1, u'few': 1, u'Vegas': 1, u'manda': 1, u'print': 1, u'sim,': 1, u'sim': 1, u'On': 1, u'to': 1, u'like!': 1, u'HUSAHUS!': 1, u'rehearsal...this': 1, u'@jamesmaslow:': 1, u'...': 1, u'epic!': 1, u'stage': 1, u'a': 1, u'show.': 1, u'last': 1, u'of': 1, u'days': 1, u'o': 1, u'@Rafinha_Angelo': 1, u'the': 2, u'Bora': 1})
total number of words 36
total uniq words 34
total corpus lexical diversity 0.944444444444
average number of words per tweet 17.0
これは実際には非常に高速に実行されますが、数千行後の一部のデータセットでは、次のように失敗します。
Traceback (most recent call last):
File "lex.py", line 21, in <module>
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
return _default_decoder.decode(s)
File "/usr/lib64/python2.7/json/decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib64/python2.7/json/decoder.py", line 382, in raw_decode
obj, end = self.scan_once(s, idx)
ValueError: Unterminated string starting at: line 1 column 1531 (char 1531)
スクリプトが読み取っている行の形式でハングアップしている場合は、その行をスキップして次に進みます。助言がありますか?