2017-06-18 51 views
0

從 HTML body 獲取文本

我有以下的 HTML 代碼:

<body class="frontend page-object" data-tealium="{"tmsData":{"ad_type":"Marktplatz","page_type":"Ad_View","vertical_id":"5","vertical":"Marktplatz","ad_title":"LEGO+Technic+8045+-+Mini-Teleskoplader+-+2+in+1","num_pictures":"4","category_level_1":"Spielen+%2F+Spielzeug","region_level_id_2":"9","category_level_3":"Lego","region_level_id_3":"117244","category_level_2":"Lego+%2F+Playmobil","region_level_id_1":"-141","price":"6","product_id":"67","category_level_max":"4","region_level_2":"Wien","region_level_3":"Wien%2C+22.+Bezirk%2C+Donaustadt","category_level_4":"Technic","seller_id":"19284847","region_level_1":"%C3%96sterreich","ad_type_id":"67","category_level_id_3":"5191","category_level_id_2":"5182","category_level_id_1":"5136","category_level_id_4":"5199","environment":"web","ad_id":"208824705","post_code":"1220","event_name":"adview","publish_date":"Sun+Jun+18+18%3A51%3A00+CEST+2017"}}" data-adid="208824705"> 

在這裏,我試圖用 BeautifulSoup 取得這個類別層級:"category_level_1":"Spielen+%2F+Spielzeug"。但是,我無法取得它。

如果我這樣做:CatId = soup2.select("html body.frontend.page-object")[0].get_text().strip(),我得到的是整個 HTML 文本。

而 CatId = soup2.find("html body.frontend.page-object", {category_level_1})[0].get_text().strip() 則什麼都得不到。我只需要取得 Spielen+%2F+Spielzeug。有什麼想法可以解決這個問題嗎?

先謝謝大家。

+0

你的 body 標籤中 data-tealium="{"tmsData"... 這一部分在我看來不像有效的 HTML,也許修正它就能解決你的問題? – glennreyes

+0

我無法修改它,而且它是有效的 HTML,可以在這裏找到:https://www.willhaben.at/iad/kaufen-und-verkaufen/d/lego-technic-8045-mini-teleskoplader-2-in-1-208824705/ – fahrradlaus

回答

0

使用 JavaScript 取得它的一種方法是:

// Parse the JSON stored in the <body> element's data-tealium attribute, then
// read the first-level category from its tmsData object.
const tealiumJson = document.body.getAttribute('data-tealium');
const { tmsData } = JSON.parse(tealiumJson);
const { category_level_1: category1 } = tmsData;

console.log(category1);

若要確保 data-tealium 存在並且已解析為 JSON:

// Defensive variant: category1 ends up null instead of the script throwing
// when the data-tealium attribute is missing, holds malformed JSON, or lacks
// the expected keys.
const tealium = document.body.getAttribute('data-tealium');

let parsedData = null;
try {
    // JSON.parse(null) yields null, so a missing attribute is already safe;
    // the try/catch covers an attribute whose value is not valid JSON.
    parsedData = JSON.parse(tealium);
} catch (err) {
    // Only malformed JSON is expected here; anything else is a real bug.
    if (!(err instanceof SyntaxError)) {
        throw err;
    }
    console.warn('data-tealium is not valid JSON:', err.message);
}

// Walk the nested keys step by step so a missing level short-circuits,
// and normalise any falsy result to null.
const category1 =
    parsedData &&
    parsedData.tmsData &&
    parsedData.tmsData.category_level_1 || null;

console.log(category1);
+0

非常感謝,但有沒有一種方法可以直接使用Python? – fahrradlaus

0

我不確定這些結果對你有多大意義,但你可以用 Python 查看各個欄位的內容。

>>> import requests 
>>> page = requests.get('https://www.willhaben.at/iad/kaufen-und-verkaufen/d/lego-technic-8045-mini-teleskoplader-2-in-1-208824705/').content 
>>> import bs4 
>>> soup = bs4.BeautifulSoup(page, 'lxml') 
>>> data_tealium = soup.find('body').attrs['data-tealium'] 
>>> info = eval(data_tealium)['tmsData'] 
>>> for i, item in enumerate(info): 
...  '--->', i 
...  item, info[item] 
... 
('--->', 0) 
('category_level_max', '4') 
('--->', 1) 
('region_level_id_1', '-141') 
('--->', 2) 
('ad_type', 'Marktplatz') 
('--->', 3) 
('seller_id', '19284847') 
('--->', 4) 
('product_id', '67') 
('--->', 5) 
('category_level_id_3', '5191') 
('--->', 6) 
('vertical_id', '5') 
('--->', 7) 
('ad_type_id', '67') 
('--->', 8) 
('region_level_id_3', '117244') 
('--->', 9) 
('category_level_4', 'Technic') 
('--->', 10) 
('region_level_3', 'Wien%2C+22.+Bezirk%2C+Donaustadt') 
('--->', 11) 
('vertical', 'Marktplatz') 
('--->', 12) 
('region_level_id_2', '9') 
('--->', 13) 
('region_level_1', '%C3%96sterreich') 
('--->', 14) 
('post_code', '1220') 
('--->', 15) 
('event_name', 'adview') 
('--->', 16) 
('page_type', 'Ad_View') 
('--->', 17) 
('category_level_1', 'Spielen+%2F+Spielzeug') 
('--->', 18) 
('num_pictures', '4') 
('--->', 19) 
('price', '6') 
('--->', 20) 
('category_level_3', 'Lego') 
('--->', 21) 
('category_level_id_2', '5182') 
('--->', 22) 
('ad_title', 'LEGO+Technic+8045+-+Mini-Teleskoplader+-+2+in+1') 
('--->', 23) 
('publish_date', 'Sun+Jun+18+18%3A51%3A00+CEST+2017') 
('--->', 24) 
('category_level_2', 'Lego+%2F+Playmobil') 
('--->', 25) 
('category_level_id_1', '5136') 
('--->', 26) 
('ad_id', '208824705') 
('--->', 27) 
('region_level_2', 'Wien') 
('--->', 28) 
('category_level_id_4', '5199') 
('--->', 29) 
('environment', 'web')