このページは XML としてひどく不正なため、lxml.etree の parse() に続けて xpath などを使うといった通常のアプローチは、すべて無残に失敗します。そのため、最善の策は次のようなものになるでしょう:
>>> import re
>>> import urllib
>>> import pprint
>>> s = urllib.urlopen("http://www.rob389.com/dp/tr/11/9789754681383").read()
>>> magic = re.compile(r'tOBJ.DATA\[0\].([A-Z0-9_]+)="([^"]+)"')
>>> my_dict = dict(magic.findall(s))
>>> pprint.pprint(my_dict)
{'DISC_PERC': '15.0000000000',
'EXCHANGE_RT': '2.2815',
'LNK_PREFIX': 'uykusuz-bir-gece-jill-murphy',
'LST_PRICE': '7.500000000000000',
'LST_YAX02_CODE': 'YTL',
'MMG00_CODE': '11',
'MMG00_TITLE': 'Kitap',
'MMM00_DESC': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DESC250': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DISC_PERC_SAL': '25',
'MMM00_HEIGHT': '19.6',
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
'MMM00_ZHEIGHT': '1',
'MMS03_PRICE_1': '7.500000000000000',
'MMS03_PRICE_2': '0.000000000000000',
'MMS03_PRICE_3': '7.500000000000000',
'MMS03_YAX02_CODE_1': 'YTL',
'MMS03_YAX02_CODE_2': 'YTL',
'MMS03_YAX02_CODE_3': 'YTL',
'NWS01_DESC': "<BR>New Orleans'da do\xf0an Lillian Hellman'\xfdn ilkgen\xe7li\xf0i daha sonra oyunlar\xfdnda \xfcst\xfc kapal\xfd olarak yer bulacak olan tuhaf ve h\xfdrsl\xfd akrabalar aras\xfdnda ge\xe7ti. New Orleans ve New York aras\xfdnda mekik dokuyarak ge\xe7en y\xfdllarda etraf\xfdndaki farkl\xfd k\xfclt\xfcrleri g\xf6zlemleme \xfeans\xfd buldu. Liseyi bitirdikten sonra Columbia ve New York \xdcniversitesi'ne devam ettiyse de, e\xf0itimini yar\xfdda b\xfdrakarak bir yay\xfdnevinde \xe7al\xfd\xfemaya ba\xfelad\xfd. 1920'lerin bohem hayat\xfdna g\xf6z k\xfdrpt\xfd\xf0\xfd bu d\xf6nemde tan\xfd\xfet\xfd\xf0\xfd gen\xe7 yazar Arthur Kober ile evlenerek Hollywood'a ta\xfe\xfdnd\xfd. <BR><BR>1930'lar\xfdn ba\xfe\xfdnda MGM'de d\xfczeltmenlik yapt\xfd. Hevesli bir solcu oldu\xf0u bu y\xfdllarda, i\xfe arkada\xfelar\xfdn\xfd sendikala\xfemalar\xfd i\xe7in<A class=A2 href=\\",
'NWS01_DESC400': '<A class=A3 href=\\',
'NWS01_ID': '588',
'NWS01_IMAGE': '/UD_OBJS/IMAGES/NWS/HSTTR/Hellman_L_231204_365_1.jpg',
'ON_ESHOP': 'T',
'PEP01_ID': '229016',
'PEP01_NAME': 'Jill Murphy',
'PRD_FNM01_ID': '23462',
'PRD_FNM01_TITLE': 'Mandolin',
'PRD_FNM01_TRD_TITLE': 'Say Yay\xfdnlar\xfd',
'PUR_VAT_VALUE': '8',
'SAL_PRICE': '6.3750000000',
'SAL_VAT_VALUE': '8',
'SAL_YAX02_CODE': 'YTL',
'UD_10': '~410~|',
'UD_10_VAL': 'T\xfcrk\xe7e',
'UD_11': '~1000~|~803.2~|',
'UD_11_VAL': '\xc7ocuk,\xd6yk\xfc',
'UD_12': '~1000.4080~|',
'UD_12_VAL': '\xc7ocuk | 07-12 Ya\xfe | Edebiyat',
'UD_15': '978-975-468-138-3',
'UD_15_VAL': '978-975-468-138-3',
'UD_16': '~PB~|',
'UD_16_VAL': 'Karton Kapak',
'UD_19': '01/01/2010',
'UD_19_VAL': '01/01/2004',
'UD_2': 'The Worst Witch Strikes Again',
'UD_20': '92',
'UD_20_VAL': '92',
'UD_21': '52',
'UD_21_VAL': '52',
'UD_22': '3',
'UD_22_VAL': '3',
'UD_23': '1',
'UD_23_VAL': '1',
'UD_24': '~HM1~|',
'UD_24_VAL': '1. Hamur',
'UD_26': '7-12',
'UD_26_VAL': '07-12',
'UD_2_VAL': 'The Worst Witch Strikes Again',
'UD_3': '~229016~|',
'UD_30': '1',
'UD_30_VAL': '1',
'UD_31': '1',
'UD_31_VAL': '1',
'UD_34': '~1~|',
'UD_34_VAL': '1-3 G\xfcn',
'UD_36': '1',
'UD_36_VAL': '1',
'UD_39': 'VAR',
'UD_39_VAL': 'Var',
'UD_3_VAL': 'Jill Murphy',
'UD_42': '~410~|',
'UD_42_VAL': 'T\xfcrk\xe7e',
'UD_6': '~239986~|',
'UD_6_VAL': 'Seza Sunar',
'YAX02_CODE': 'EUR'}
>>>