重複の可能性:
Python での CSV ファイルのフィルタリング
Python で記述された CSV パーサーがあるのですが、500 行目あたりから、正規表現 ^(\w+)\*(\d\d):(\d\d):(\d\d)$ のマッチングに失敗し始めます。
import csv
import re
import sys
# Filter mhc.csv into mhc_fixed.csv.
#
# Only the columns whose header starts with 'HLA-D' are inspected:
#   * a row is dropped when any of those columns holds an allele with a
#     single two-digit field (resolution too low);
#   * an allele with three or four fields is truncated to its first two
#     fields (resolution too high).
# All other columns are copied through unchanged.

# Allele with a single two-digit field, e.g. "DRB1*15": resolution too low.
LOW_RESOLUTION = re.compile(r'^\w+\*\d\d$')

# Alleles whose resolution is too high.  Each pattern captures the gene name
# and the first two fields so the value can be rewritten as r'\1*\2:\3'.
# The four-field check originally tested a five-field pattern but substituted
# with a four-field one, so it could never rewrite anything; both steps now
# share the same four-field pattern.
HIGH_RESOLUTION = (
    re.compile(r'^(\w+)\*(\d\d):(\d\d):(\d\d)$'),
    re.compile(r'^(\w+)\*(\d+):(\d+):(\d+):(\d+)$'),
)

# The csv module wants binary files on Python 2 but text files opened with
# newline='' on Python 3; pick the matching open() arguments.
if sys.version_info[0] >= 3:
    read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}
else:
    read_mode, write_mode, open_kwargs = 'rb', 'wb', {}

# The with-statement guarantees both files are flushed and closed, even when
# a row raises part-way through the run.
with open('mhc.csv', read_mode, **open_kwargs) as infile, \
        open('mhc_fixed.csv', write_mode, **open_kwargs) as outfile:
    csvdictreader = csv.DictReader(infile, delimiter=',')
    csvdictwriter = csv.DictWriter(outfile, fieldnames=csvdictreader.fieldnames, delimiter=',')
    csvdictwriter.writeheader()
    targets = [name for name in csvdictreader.fieldnames if name.startswith('HLA-D')]

    for rowfields in csvdictreader:
        keep = True
        for field in targets:
            value = rowfields[field]
            if value is None:
                # DictReader fills cells missing from a short row with None,
                # which the regex functions cannot handle; nothing to check.
                continue
            if LOW_RESOLUTION.match(value):  # gene resolution too low?
                keep = False
                break  # quit processing target fields
            # Reduce gene resolution if too high by keeping only the first
            # two fields.  The patterns are mutually exclusive, so stop at
            # the first one that matches.
            for pattern in HIGH_RESOLUTION:
                if pattern.match(value):
                    rowfields[field] = pattern.sub(r'\1*\2:\3', value)
                    break
        if keep:
            csvdictwriter.writerow(rowfields)
        # NOTE(review): the original compared the row dict itself with 1400,
        # which is always true on Python 2 and a TypeError on Python 3.  The
        # reader's line number is presumably what was meant -- confirm.  The
        # original print statement had no payload, so a bare newline is still
        # what gets written to stderr.
        if csvdictreader.line_num > 1400:
            sys.stderr.write('\n')