python - Pythonを使用してXMLファイルを解析する

Question

私は、Googleの特許リストからデータをダウンロードして解析するための、すでに書かれたPythonモジュールを持っています。2005年より前のデータを処理しようとしない限り、コードは問題なく動作します。私にはモジュールの実行方法以外にPythonの知識がありません。どうすれば修正できますか？

私が受け取るトレースバックは次のとおりです。

Traceback (most recent call last): 
  File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module> 
    assert xml_file is not None
AssertionError

そして、これは私が使用しているコードです：

#Ignore all this information 
import urllib2, os, zipfile
from lxml import etree
#-------------------------------------------------------------------------------
#Ignore all this information 
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
  """Split a stream of concatenated XML documents into one string per document.

  data      -- iterable of lines (for example an open file object).
  separator -- predicate called with each line; a true result marks the first
               line of a new document. By default a line starting with the
               XML declaration ('<?xml') starts a new document.

  Yields every document as a single string made of its lines joined unchanged.
  Lines that precede the first separator line are yielded as a chunk of their
  own. Empty input yields nothing, so callers never receive a spurious empty
  document.
  """
  buff = []
  for line in data:
    # A separator line closes the document collected so far (if there is one).
    if separator(line) and buff:
      yield ''.join(buff)
      buff = []
    buff.append(line)
  # Flush the last document; skipped when no line was read at all.
  if buff:
    yield ''.join(buff)

def first(seq,default=None):
  """Give back the leading element of seq, or default when seq is empty."""
  return next(iter(seq), default)
#-------------------------------------------------------------------------------
#This is where you change the internet source file- Use the file extensions from the sheet provided.
datasrc = "http://storage.googleapis.com/patents/grant_full_text/2003/pg030107.zip"
#http://commondatastorage.googleapis.com/patents/grant_full_text/2012/ipg120117.zip
#Local archive name: the last path component of the URL.
filename = datasrc.split('/')[-1]
#-------------------------------------------------------------------------------
#Ignore all this information 
#Download the archive only when there is no local copy yet. The whole payload
#is fetched BEFORE the local file is created, so a failed download cannot
#leave an empty archive behind that every later run would silently reuse.
if not os.path.exists(filename):
  r = urllib2.urlopen(datasrc)
  try:
    payload = r.read()
  finally:
    #urllib2 responses are not context managers, so close explicitly.
    r.close()
  with open(filename,'wb') as file_write:
    file_write.write(payload)

zf = zipfile.ZipFile(filename)
#The first archive member whose name ends in '.xml' is the file to parse.
xml_file = first([ x for x in zf.namelist() if x.endswith('.xml')])
#Raised explicitly (instead of a bare assert) so that the check survives
#"python -O" and reports what the archive really contains.
if xml_file is None:
  raise AssertionError("no '.xml' member found in %s; archive members: %r" % (filename, zf.namelist()))
#-------------------------------------------------------------------------------
#output set your folder location here, keep double \\ between
#Destination folder of the report files (keep the doubled backslashes,
#including the trailing pair).
outFolder = "C:\\PatentFiles\\"
#Report base name: the archive name without its extension.
outFilename = os.path.splitext(filename)[0]
#-------------------------------------------------------------------------------
#Full paths of the four report files.
output, output2, output3, output4 = (
  outFolder + outFilename + "_general.txt",
  outFolder + outFilename + "_USCL.txt",
  outFolder + outFilename + "_citation.txt",
  outFolder + outFilename + "_inventor.txt",
)
#Create (or truncate) the report files, in the same order as the paths above.
outFile = open(output, "w")
outFile2 = open(output2, "w")
outFile3 = open(output3, "w")
outFile4 = open(output4, "w")
#Each report starts with a pipe-separated header line.
for reportFile, headerLine in (
  (outFile, "Patent No.|GrantDate|Application Date|Number of Claims|Examiners|US Primary Main Classification|Assignee|Assignee Address City_State_Country|First Inventor|First Inventor Address City_State_Country| \n"),
  (outFile2, "Patent No.|Primary|U.S Classification| \n"),
  (outFile3, "Patent No.|Citation|Citation Date|Who Cited This| \n"),
  (outFile4, "Patent No.|Inventor Last Name|First Name|City|State|Country|Nationality Country|Residence Country|\n"),
):
  reportFile.write(headerLine)
#-------------------------------------------------------------------------------
#Here is the count- adjust this each time you run the program for the first time.
#Run at 10 for the 1st run then 5500 afterward.
#Maximum number of patent documents to process in one run
#(10 for a first trial run, 5500 for a full file).
maxDocs = 10


def _replace_all(text, steps):
  """Apply every (old, new) pair of steps to text, strictly in order.

  The order is significant: later pairs rely on markers produced (or removed)
  by earlier ones. Returns the rewritten text.
  """
  for old, new in steps:
    text = text.replace(old, new)
  return text


def _utf8(value):
  """Return str(value), UTF-8 encoding value first when it is non-empty text.

  A missing XML element (None) becomes the string "None"; an empty string
  stays empty.
  """
  return str(value.encode("UTF-8") if value else value)


#Patent-number clean-up, applied in order. "PP0" is listed twice on purpose:
#each pass strips one leading zero after the "PP" prefix.
_DOC_ID_STEPS = (
  ("D0", "D"),
  ("H000", "H"),
  ("PP0", "PP"),
  ("PP0", "PP"),
  ("RE0", "RE"),
  ("~0", "~"),
  ("US~", ""),
)

#Citation steps that do not depend on the patent number.
#1) Kind codes that follow a cited document number become the "$@" marker.
#2) One-off fixes for "US"/"DE" embedded inside a citation.
#3) Fixes for cites without pipes - DONT TOUCH THESE.
#4) The "$@" marker finally becomes a pipe.
_CITATION_HEAD_STEPS = tuple(
  ("~" + kind + "~", "$@") for kind in ("A", "S", "S1", "B1", "B2", "A1", "H", "E")
) + (
  ("05225US~", "05225U$|"),
  ("063106 DE", "063106D!"),
  ("US~US~", "US~"),
  ("PCT/US", "PCT/U$"),
  ("PCTUS", "PCTU$"),
  ("WO US", "WO U$"),
  ("WO~US", "WO~ U$"),
  ("US~cited by examiner", "||cited by examiner"),
  ("US~cited by other", "||cited by other"),
  ("$@", "|"),
)

#End-of-citation fixes - DONT TOUCH THESE. They collapse runs of repeated
#citation categories; the round below is applied three times, followed by
#its last four steps once more, exactly as tuned by hand.
_CITED_BY_ROUND = (
  ("cited by examiner~cited by other~cited by other", "cited by examiner"),
  ("cited by other~cited by examiner~cited by examiner", "cited by other"),
  ("cited by other~cited by other~cited by other~cited by other", "cited by other"),
  ("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner"),
  ("cited by other~cited by other", "cited by other"),
  ("cited by examiner~cited by examiner", "cited by examiner"),
  ("cited by other~cited by examiner", "cited by other"),
  ("cited by examiner~cited by other", "cited by examiner"),
)

_CITATION_TAIL_STEPS = (
  ("cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other", "cited by other"),
) + _CITED_BY_ROUND * 3 + _CITED_BY_ROUND[4:] + (
  ("~", "|"),
  ("US", "||"),
  #Post-processing fixes; "$" and "D!" undo the protective markers set above.
  ("CA|", "Foreign -CA-"),
  ("EP|", "Foreign -EP-"),
  ("CN|", "Foreign -CN-"),
  ("$", "S"),
  ("D!", "DE"),
)


def _format_citations(raw, docID):
  """Turn the '~'-joined citation fields of one patent into report rows.

  raw   -- '~'-joined text of the citation XPath query.
  docID -- cleaned patent number; it prefixes every additional row.

  Every cited document starts a new "<docID>|..." row; foreign documents are
  tagged "Foreign -XX-". To support another country, add new_row("XX") (and
  inline("XX") when the code can also appear without a leading '~').
  """
  row = "|" + "\n" + str(docID) + "|"

  def new_row(code):
    return ("~" + code + "~", row + "Foreign -" + code + "-")

  def inline(code):
    return (code + "~", "Foreign -" + code + "-")

  country_steps = (
    ("~US~", row),
    ("US~", ""),
    new_row("JP"), inline("JP"),
    new_row("GB"), inline("GB"),
    new_row("WO"), inline("WO"),
    new_row("CA"),
    ("~DE~EP~", "~DE~ EP-"),
    new_row("DE"), inline("DE"),
    new_row("KR"), inline("KR"),
    new_row("EM"),
    new_row("CH"),
    new_row("DE"),
    new_row("SE"),
    new_row("FR"),
    #NOTE(review): this step can never match because "~FR~" was already
    #rewritten by the step above (compare the DE/EP pair, where the EP fix
    #comes first). Order left untouched to keep the output unchanged.
    ("~FR~EP~", "~FR~ EP-"),
    inline("FR"),
    new_row("CN"),
    new_row("TW"),
    ("~TW", row + "Foreign -TW-"),
    inline("TW"),
    new_row("NL"),
    new_row("BR"),
    new_row("AU"),
    new_row("ES"),
    new_row("IT"),
    new_row("SU"),
    new_row("AT"),
    new_row("BE"),
    new_row("DK"),
    new_row("RU"), inline("RU"),
  )
  return _replace_all(raw, _CITATION_HEAD_STEPS + country_steps + _CITATION_TAIL_STEPS)


#Files after 2009: countries whose missing State column gets a "None" filler.
_OMITTED_STATE_CODES = (
  "KR", "GB", "IT", "JP", "FR", "BR", "NO", "HK", "CA", "TW", "SE", "CH",
  "DE", "SG", "IN", "IL", "CN", "FI", "ZA", "NL", "AT", "AU", "BE", "CZ",
  "RU", "IE", "AR", "MY", "SK", "ES", "NZ", "HU", "UA", "DK", "TH", "MX",
)
#2005-2008 files: the same filler, keyed on the tripled country code.
_TRIPLED_STATE_CODES = (
  "NO", "NZ", "RU", "RO", "SE", "SG", "SI", "TH", "TR", "TW", "VE", "ZA",
  "AN", "AR", "BA", "PH", "HR", "LT", "EE", "BJ", "CR", "PL", "CO", "UA",
  "KW", "CL", "CY", "LI", "SA",
)
#Files from 2009 on: countries after which a new inventor row starts.
_OMITTED_ROW_CODES = (
  "US", "FR", "DK", "KR", "JP", "GB", "IT", "CH", "SG", "DE", "IN", "TW", "CN",
)
#2005-2008 files: tripled country codes after which a new inventor row starts.
_TRIPLED_ROW_CODES = (
  "AT", "AN", "AR", "AU", "AZ", "BA", "BE", "BR", "BS", "CA", "CH", "CN",
)


def _format_inventors(raw, docID):
  """Turn the '~'-joined inventor fields of one patent into report rows.

  raw   -- '~'-joined text of the inventor XPath query.
  docID -- cleaned patent number; it prefixes every additional row.
  """
  row = "|" + "\n" + str(docID) + "|"
  steps = [("\n-", "")]
  steps += [("~" + code + "~omitted", "~None~" + code + "~omitted") for code in _OMITTED_STATE_CODES]
  steps += [("~" + "~".join([code] * 3), "~None~" + "~".join([code] * 3)) for code in _TRIPLED_STATE_CODES]
  steps += [("omitted~" + code + "~", "omitted~" + code + row) for code in _OMITTED_ROW_CODES]
  steps += [("~".join([code] * 3) + "~", "~".join([code] * 3) + row) for code in _TRIPLED_ROW_CODES]
  steps += [
    #Special-case fixes for names that would produce a wrong column count.
    ('~None~None~NO~', '~None~NO~'),
    ('Ramandeep~Chandigarh', 'Ramandeep|None~Chandigarh'),
    ('Esk~eh~r', 'Eskehr'),
    ('Baychar~Eastport', 'Baychar~None~Eastport'),
    ('US~1', '||||||'),
    ('~', '|'),
  ]
  return _replace_all(raw, steps)


count = 0
try:
  for item in xmlSplitter(zf.open(xml_file)):
    count += 1
    if count > maxDocs: break
    doc = etree.XML(item)

    #Every report needs the publication country. Stop with a clear message
    #when the document does not have the expected layout, instead of failing
    #later with an AttributeError on None.
    pubCountry = first(doc.xpath('//publication-reference/document-id/country/text()'))
    if pubCountry is None:
      raise ValueError("document %d of %s has no publication-reference/document-id/country element; "
                       "the XPath expressions of this script do not match its layout" % (count, xml_file))

    #---------------------------------------------------------------------------
    #General information file.
    docID = _replace_all(
      "~".join(doc.xpath('//publication-reference/document-id/country/text()|//publication-reference/document-id/doc-number/text()')),
      _DOC_ID_STEPS)

    grantdate = first(doc.xpath('//publication-reference/document-id/date/text()'))
    applicationdate = first(doc.xpath('//application-reference/document-id/date/text()'))
    claimsNum = first(doc.xpath('//number-of-claims/text()'))

    #Joined with ", " directly: joining with "-" and replacing "-" afterwards
    #also rewrote hyphens inside names (e.g. "Hewlett-Packard").
    assignee1 = ", ".join(doc.xpath('//assignees/assignee/addressbook/orgname/text()|//assignees/assignee/addressbook/last-name/text()|//assignees/assignee/addressbook/first-name/text()'))
    assignee2 = "_".join(doc.xpath('//assignee/addressbook/address/*/text()'))
    assignees = str(assignee1.encode("UTF-8")) + "|" + str(assignee2.encode("UTF-8"))

    #First inventor: "First Last|City_State_Country"; missing parts print as "None".
    inventorLast = first(doc.xpath('//applicants/applicant/addressbook/last-name/text()'))
    inventorFirst = first(doc.xpath('//applicants/applicant/addressbook/first-name/text()'))
    inventorCity = first(doc.xpath('//applicants/applicant/addressbook/address/city/text()'))
    inventorState = first(doc.xpath('//applicants/applicant/addressbook/address/state/text()'))
    inventorCountry = first(doc.xpath('//applicants/applicant/addressbook/address/country/text()'))
    inventors = (_utf8(inventorFirst) + " " + _utf8(inventorLast) + "|" +
                 _utf8(inventorCity) + "_" + _utf8(inventorState) + "_" + _utf8(inventorCountry))

    examiners = ", ".join(doc.xpath('//examiners/primary-examiner/first-name/text()|//examiners/primary-examiner/last-name/text()'))

    uscl1 = first(doc.xpath('//classification-national/main-classification/text()'))

    #---------------------------------------------------------------------------
    #USCL file: the first classification is flagged 1 (primary), every further
    #classification goes on its own row flagged 0.
    notprimary = pubCountry.replace("US","0")
    primary1 = pubCountry.replace("US","1")
    uscl2 = "~".join(doc.xpath('//us-bibliographic-data-grant/classification-national/*/text()|//sequence-cwu/publication-reference/document-id/country/text()'))
    uscl2 = _replace_all(uscl2, (
      ("US~", str(primary1) + "|"),
      ("~", "|" + "\n" + str(docID) + "|" + str(notprimary) + "|"),
      ("US", "|"),
    ))

    #---------------------------------------------------------------------------
    #Citation file.
    citation = _format_citations(
      '~'.join(doc.xpath('//publication-reference/document-id/country/text()|//references-cited/citation/patcit/document-id/country/text()|//references-cited/citation/patcit/document-id/doc-number/text()|//references-cited/citation/patcit/document-id/kind/text()|//references-cited/citation/patcit/document-id/date/text()|//references-cited/citation/category/text()')),
      docID)

    #---------------------------------------------------------------------------
    #Inventors file.
    inventor1 = _format_inventors(
      '~'.join(doc.xpath('//applicants/applicant/addressbook/last-name/text()|//applicants/applicant/addressbook/first-name/text()|//applicants/applicant/addressbook/address/city/text()|//applicants/applicant/addressbook/address/state/text()|//applicants/applicant/addressbook/address/country/text()|//applicants/applicant/nationality/*/text()|//applicants/applicant/residence/*/text()|//sequence-cwu/publication-reference/document-id/country/text()|//sequence-cwu/number/text()')),
      docID)

    #---------------------------------------------------------------------------
    #Console preview - enable only the one line you wish to view.
    #(print with a single parenthesised argument behaves the same as the print statement.)
    print("DocID: {0}\nGrantDate: {1}\nApplicationDate: {2}\nNumber of Claims: {3}\nExaminers: {4}\nAssignee: {5}\nInventor: {6}\nUS Cl.: {7}\n".format(docID,grantdate,applicationdate,claimsNum,examiners.encode("UTF-8"),assignees,inventors,uscl1))
    #print("DocID: {0}\nU.S Cl: {1}\nPrimary: {2}\n".format(docID,uscl2,primary1))
    #print("DocID: {0}\nCitation: {1}\n".format(docID,citation.encode("UTF-8")))
    #print("DocID:    {0}\nInventors: {1}\n".format(docID,inventor1.encode("UTF-8")))

    #---------------------------------------------------------------------------
    #Report rows. Text taken from the XML is UTF-8 encoded before writing so
    #that non-ASCII characters cannot raise UnicodeEncodeError.
    outFile.write(str(docID) + "|" + str(grantdate) + "|" + str(applicationdate) + "|" + str(claimsNum) + "|" + str(examiners.encode("UTF-8")) + "|" + str(uscl1) + "|" + str(assignees) + "|" + str(inventors) + "|" + "\n")

    #Classifications only
    outFile2.write(str(docID) + "|" + str(uscl2.encode("UTF-8")) + "|" + "\n")

    #Citations only
    outFile3.write(str(docID) + "|" + str(citation.encode("UTF-8")) + "|" + "\n")

    #Inventors only
    outFile4.write(str(docID) + "|" + str(inventor1.encode("UTF-8")) + "|" + "\n")
finally:
  #Close (and thereby flush) the reports even when a document fails to parse.
  outFile.close()
  outFile2.close()
  outFile3.close()
  outFile4.close()
print("output files complete")

score 1 · Accepted Answer

あなたが見ている問題はPythonの問題ではありません。このコードはzipファイルを開き、その中にxmlファイルが見つかることを期待しています。assertステートメントは、xmlファイルが見つかったことを確認するためのチェックです。xmlファイルが見つからない場合にプログラムを停止するように設計されています。datasrcに指定されたzipファイルをダウンロードしてみると、空のzipファイルであることがわかります。その中からxmlファイルを探しても見つからないので、xml_file = Noneになります。次にassertステートメントに到達すると、アサーションエラーが発生します。

assertを取り除けば、コードはおそらく先へ進みますが、その後プログラムがクラッシュしたときに、その理由がわからなくなります。assertを残しておけば、問題がいつ、どこで、なぜ起きたのかを突き止めるための便利な手がかりになります。

python - Pythonを使用してXMLファイルを解析する

1 に答える 1

Related

Reference