python - カスタムフィールドパターンによるファイル名の解析

Question

今週から Python の学習を始めたので、Excel ではなく Python を使用して、ファイルパスからいくつかのフィールドを解析しようと考えました。

すべて次の命名規則に従う約 3000 個のファイルがあります：/Household/LastName.FirstName.Account.Doctype.Date.extension

たとえば、これらのファイルの 1 つに Cosby.Bill..Profile.2006.doc という名前が付けられ、フルパスは /Volumes/HD/Organized Files/Cosby, Bill/Cosby.Bill..Profile.2006.doc です。

この場合：

「Cosby, Bill」が世帯（Household）です

世帯 (Cosby, Bill) は、実際のファイルを囲むフォルダーです。

Bill が名（FirstName）です

Cosby が姓（LastName）です

Account フィールドが省略されています

プロファイルはDoctypeです

日付は2006年

docは拡張子です

これらのファイルはすべて、このディレクトリ /Volumes/HD/Organized Files/ にあります。ターミナルで ls を使ってすべてのファイルの一覧をデスクトップの .txt ファイルに書き出し、そのファイルパスから情報を解析して、上記のサンプルのようなカテゴリに分けようとしています。理想的には、カテゴリごとに列を持つ csv に出力したいと考えています。以下が私の不格好なコードです：

def main():
    """Split every path in the listing file into its name fields and print them.

    Each line is expected to look like
    ``.../Household/LastName.FirstName.Account.Doctype.Date.extension``.
    The fields are located purely with ``str.find`` and slicing; nothing is
    returned, the result is only printed.
    """
    # NOTE(review): open() does not expand '~'; os.path.expanduser() is needed
    # for these paths to resolve to the home directory.
    file = open('~/Desktop/client_docs.csv', "rb")
    # NOTE(review): `output` is opened but never written to or closed below.
    output = open('~/Desktop/client_docs_parsed.txt', "wb")
    for line in file:
        # find_nth(line, '/', 2) is the part of the line up to and including
        # the second-to-last '/', i.e. everything before the household folder.
        i = line.find(find_nth(line, '/', 2))
        # Text after that prefix: "<household>/<file name>".
        beghouse = line[i + len(find_nth(line, '/', 2)):]
        endhouse = beghouse.find('/')
        household = beghouse[:endhouse]
        # Starting at the first occurrence of the household name in the line,
        # take what lies between the next '/' and the next '.'.
        lastn = (line[line.find(household):])[(line[line.find(household):]).find('/') + 1:(line[line.find(household):]).find('.')]
        # Text between the first and the second '.' of the WHOLE line, so a '.'
        # anywhere earlier in the path shifts this field.
        firstn = line[line.find('.') + 1: line.find('.', line.find('.') + 1)]
        # Text between "<lastn>.<firstn>." and the following '.'.
        acct = line[line.find('{}.{}.'.format(lastn,firstn)) + len('{}.{}.'.format(lastn,firstn)):line.find('.',line.find('{}.{}.'.format(lastn,firstn)) + len('{}.{}.'.format(lastn,firstn)))]
        # Everything after "<lastn>.<firstn>.<acct>."; the doctype is its first
        # '.'-delimited piece.
        doctype_beg = line[line.find('{}.{}.{}.'.format(lastn, firstn, acct)) + len('{}.{}.{}.'.format(lastn, firstn, acct)):]
        doctype = doctype_beg[:doctype_beg.find('.')]
        # Everything after "<household>/<lastn>.<firstn>.<acct>.<doctype>.";
        # the date is its first '.'-delimited piece.
        date_beg = line[line.find('{}/{}.{}.{}.{}.'.format(household,lastn,firstn,acct,doctype)) + len('{}/{}.{}.{}.{}.'.format(household,lastn,firstn,acct,doctype)):]
        date = date_beg[:date_beg.find('.')]
        # Python 2 print statement: the comma-separated items are joined with
        # single spaces, so every value comes out as `" value "`.
        print '"',household, '"','"',lastn, '"','"',firstn, '"','"',acct, '"','"',doctype, '"','"',date,'"'

def find_nth(body, s_term, n):
    """Return the start of ``body`` up to and including the n-th ``s_term`` from the right.

    The search runs over the reversed string, looking for ``s_term`` as given
    (it is not reversed), so the function is only meaningful for terms that
    read the same in both directions, such as a single character.

    If ``s_term`` occurs fewer than ``n`` times the search position ends up at
    -1 and only the first character of ``body`` is returned.
    """
    flipped = body[::-1]
    step = len(s_term)
    pos = flipped.find(s_term)
    remaining = n
    # Hop from one occurrence to the next until the n-th one is reached or the
    # search runs out of matches.
    while pos >= 0 and remaining > 1:
        pos = flipped.find(s_term, pos + step)
        remaining -= 1
    # Drop what precedes the match in the reversed string, then restore order.
    return flipped[pos:][::-1]

# Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__": main()

正常に動作しているようですが、囲んでいるフォルダーがもう 1 階層あると問題が発生し、すべてのフィールドがずれてしまいます。たとえば、ファイルが次の場所ではなく

/Volumes/HD/Organized Files/Cosby, Bill/

/Volumes/HD/Organized Files/Resigned/Cosby, Bill/ にある場合です。

これを行うには、もっと不格好でない（スマートな）方法があるはずだと思っています。

score 1 · Accepted Answer

あなたの関数 find_nth() よりも実用的なツールがあります：rsplit()

def find_nth(body, s_term, n):
    """Traced version of find_nth: print every intermediate value, then return the result.

    Returns the start of ``body`` up to and including the n-th ``s_term``
    counted from the right. The search runs over the reversed string with
    ``s_term`` as given, so it is only meaningful for single-character terms.
    If there are fewer than ``n`` occurrences the position becomes -1 and only
    the first character of ``body`` is returned.

    Every print call takes one pre-formatted string, which produces the same
    output under Python 2 and Python 3.
    """
    # Reverse once instead of on every use.
    flipped = body[::-1]
    start = flipped.find(s_term)
    print('------------------------------------------------')
    print('body[::-1]\n%s' % (flipped,))
    print('\nstart == %s' % start)
    while start >= 0 and n > 1:
        start = flipped.find(s_term, start+len(s_term))
        print('n == %s    start == %s' % (n,start))
        n -= 1
    tail = flipped[start:]
    print('\n (body[::-1])[start:]\n%s' % (tail,))
    result = tail[::-1]
    print('\n((body[::-1])[start:])[::-1]\n%s' % (result,))
    print('---------------\n')
    return result


def cool_find_nth(body, s_term, n):
    """Return ``body`` up to and including the n-th ``s_term`` counted from the right.

    ``s_term`` must be exactly one character long (checked with an assert).
    When ``body`` holds fewer than ``n`` separators nothing is cut off, so the
    result is the whole of ``body`` with ``s_term`` appended.
    """
    assert len(s_term) == 1
    # rsplit with a limit of n leaves everything left of the n-th separator
    # from the right in the first piece.
    head = body.rsplit(s_term, n)[0]
    return head + s_term


# Demo: run both helpers on the same sample string and print what each returns.
# Every print call takes one pre-formatted string, so the output is the same
# under Python 2 and Python 3.
ss = 'One / Two / Three / Four / Five / Six / End'
print('the string\n%s\n' % ss)

print('================================\n'
      "find_nth(ss, '/', 3)\n%s" % find_nth(ss, '/', 3))

print('=================================')
print("cool_find_nth(ss, '/', 3)\n%s" % cool_find_nth(ss, '/', 3))

結果

the string
One / Two / Three / Four / Five / Six / End

------------------------------------------------
body[::-1]
dnE / xiS / eviF / ruoF / eerhT / owT / enO

start == 4
n == 3    start == 10
n == 2    start == 17

 (body[::-1])[start:]
/ ruoF / eerhT / owT / enO

((body[::-1])[start:])[::-1]
One / Two / Three / Four /
---------------

================================
find_nth(ss, '/', 3)
One / Two / Three / Four /
=================================
cool_find_nth(ss, '/', 3)
One / Two / Three / Four /

編集1

別の非常に実用的なツールがあります：正規表現

import re

# Matches ".../Household/LastName.FirstName.Account.Doctype.Date.extension" at
# the end of a string and captures the six fields before the extension.
# Raw strings are used so that the regex escapes \. and \Z are not read as
# (invalid) string escapes; the compiled pattern is unchanged.
reg = re.compile(r'/'
                 r'([^/.]*?)/'    # household: the folder directly enclosing the file
                 r'([^/.]*?)\.'   # last name
                 r'([^/.]*?)\.'   # first name
                 r'([^/.]*?)\.'   # account (may be empty)
                 r'([^/.]*?)\.'   # doctype
                 r'([^/.]*?)\.'   # date
                 r'[^/.]+\Z')     # extension, not captured; must end the string

def main():
    """Parse two sample paths with the module-level pattern ``reg`` and print the fields.

    For every sample line the six captured fields (household, last name,
    first name, account, doctype, date) are printed with ``%r`` so that empty
    fields stay visible. A line that does not match the pattern is reported
    and skipped instead of raising AttributeError on ``None``.
    """
    # The real input would come from the listing file instead of `li`:
    #file = open('~/Desktop/client_docs.csv', "rb")
    #output = open('~/Desktop/client_docs_parsed.txt', "wb")
    li = ['/Household/LastName.FirstName.Account.Doctype.Date.extension',
          '- /Volumes/HD/Organized Files/Cosby, Bill/Cosby.Bill..Profile.2006.doc']
    for line in li:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("line == %r" % line)
        match = reg.search(line)
        if match is None:
            print("no match\n")
            continue
        # groups('') substitutes '' for any group that did not take part in
        # the match.
        household,lastn,firstn,acct,doctype,date = match.groups('')
        print ('household == %r\n'
               'lastn     == %r\n'
               'firstn    == %r\n'
               'acct      == %r\n'
               'doctype   == %r\n'
               'date      == %r\n'
               % (household,lastn,firstn,acct,doctype,date))

# Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__": main()

結果

line == '/Household/LastName.FirstName.Account.Doctype.Date.extension'
household == 'Household'
lastn     == 'LastName'
firstn    == 'FirstName'
acct      == 'Account'
doctype   == 'Doctype'
date      == 'Date'

line == '- /Volumes/HD/Organized Files/Cosby, Bill/Cosby.Bill..Profile.2006.doc'
household == 'Cosby, Bill'
lastn     == 'Cosby'
firstn    == 'Bill'
acct      == ''
doctype   == 'Profile'
date      == '2006'

編集2

最後の編集を投稿したとき、私の脳はどこにあったのだろうか。以下も同様に機能します。

# Split the path on every '/' and every '.'. Counting from the end, the last
# piece is the extension; the six pieces before it ([-7:-1]) are household,
# last name, first name, account, doctype and date.
# NOTE(review): `line` is not defined in this snippet; presumably it is one
# file path string.
rig = re.compile('[/.]')
rig.split(line)[-7:-1]

score 1 · Accepted Answer

私が理解した限りでは、事前に作成したファイル一覧に依存しない解決策として、これが使えると思います。

import csv
import os, os.path

# Replace this with the directory where the household directories are stored.
directory = "home"

headerRow = ["Household", "Lastname", "Firstname", "Account", "Doctype",
              "Date", "Extension"]

# NOTE: "wb" is the Python 2 idiom for csv output; on Python 3 open the file
# with open("Output.csv", "w", newline="") instead.
# The with-block flushes and closes the file even if a row fails to write.
with open("Output.csv", "wb") as output:
    csvf = csv.writer(output)
    csvf.writerow(headerRow)

    # os.walk visits `directory` and every folder below it. Taking the name of
    # the folder currently being visited as the household keeps the rows
    # correct when households are nested one level deeper (for example
    # "Resigned/Cosby, Bill"); joining sub-folder names onto `directory`
    # would produce paths that do not exist in that case.
    for root, households, files in os.walk(directory):
        if root == directory:
            # Files directly in the top directory belong to no household.
            continue
        household = os.path.basename(root)
        for filename in files:
            # This will create a record for each filename within the "household"
            # Then will split the filename out, using the "." as a delimiter
            # to get the detail
            csvf.writerow([household] + filename.split("."))

これは、osライブラリを使用して世帯のリストを「ウォーク」します。次に、「世帯」ごとに、ファイルリストを収集します。これは、このリストを使用して、csvファイルにレコードを生成し、ピリオドを区切り文字として使用して、ファイルの名前を分解します。

csvライブラリを使用して出力を生成します。出力は次のようになります。

"Household,LastName,Firstname,Account,Doctype,Date,Extension"

拡張子が不要な場合は、次の行を変更することで省略できます。

csvf.writerow([household] + filename.split("."))

に

# [:-1] keeps every piece except the last one (the extension); [-1] would pick
# the extension string itself and make the list + str concatenation fail.
csvf.writerow([household] + filename.split(".")[:-1])

これで、ファイル名の最後の部分（拡張子）を除いた部分だけが使われます。あわせて、headerRow から "Extension" の文字列も削除してください。

うまくいけば、これが役立つ

score 0 · Accepted Answer

質問が何であるかは少し不明確ですが、それまでの間、ここにあなたが始めるための何かがあります:

#!/usr/bin/env python

import os
import csv

# Read "f1" as '.'-delimited records; the first field of each record still
# carries the directory part of the path.
with open("f1", "rb") as fin:
    for row in csv.reader(fin, delimiter='.'):
        # Break the leading "dir/name" field into directory and name, then
        # print all fields joined by commas.
        head, tail = os.path.split(row[0])
        print(','.join([head, tail] + row[1:]))

出力：

/Household,LastName,FirstName,Account,Doctype,Date,extension

別の解釈は、各フィールドをパラメーターに格納したいということであり、追加のパスが問題を台無しにするというものです...

for ループ内では、row は次のようになります。

['/Household/LastName', 'FirstName', 'Account', 'Doctype', 'Date', 'extension']

解決策は、逆方向に作業することです。

row[-1] を extension に、row[-2] を date に、というように割り当てます。

python - カスタムフィールドパターンによるPython解析ファイル名

3 に答える 3

編集1

編集2

Related

Reference