以下のパターンを使用して文字列をトークン化しようとしています。
>>> splitter = re.compile(r'((\w*)(\d*)\-\s?(\w*)(\d*)|(?x)\$?\d+(\.\d+)?(\,\d+)?|([A-Z]\.)+|(Mr)\.|(Sen)\.|(Miss)\.|.$|\w+|[^\w\s])')
>>> splitter.split("Hello! Hi, I am debating this predicament called life. Can you help me?")
次の出力が得られます。誰かが私が修正する必要があることを指摘してもらえますか?私は「なし」の束全体について混乱しています。また、文字列をトークン化するためのより良い方法がある場合は、追加のヘルプを本当にいただければ幸いです。
['', 'Hello', None, None, None, None, None, None, None, None, None, None, '', '!', None, None, None, None, None, None, None, None, None, None, ' ', 'Hi', None,None, None, None, None, None, None, None, None, None, '', ',', None, None, None, None, None, None, None, None, None, None, ' ', 'I', None, None, None, None, None, None, None, None, None, None, ' ', 'am', None, None, None, None, None, None,None, None, None, None, ' ', 'debating', None, None, None, None, None, None, None, None, None, None, ' ', 'this', None, None, None, None, None, None, None, None, None, None, ' ', 'predicament', None, None, None, None, None, None, None, None, None, None, ' ', 'called', None, None, None, None, None, None, None, None, None, None, ' ', 'life', None, None, None, None, None, None, None, None, None, None, '', '.', None, None, None, None, None, None, None, None, None, None, ' ', 'Can', None, None, None, None, None, None, None, None, None, None, ' ', 'you', None, None, None, None, None, None, None, None, None, None, ' ', 'help', None, None,None, None, None, None, None, None, None, None, ' ', 'me', None, None, None, None, None, None, None, None, None, None, '', '?', None, None, None, None, None, None, None, None, None, None, '']
私が欲しい出力は次のとおりです:-
['Hello', '!', 'Hi', ',', 'I', 'am', 'debating', 'this', 'predicament', 'called', 'life', '.', 'Can', 'you', 'help', 'me', '?']
ありがとうございました。