Converting Scrapy To Lxml
I have scrapy code that looks like this:

    for row in response.css('div#flexBox_flex_calendar_mainCal table tr.calendar_row'):
        print '================'
        print row.xpath(".//td[@class='time']/text()").extract()
        ...
Solution 1:
I like to use lxml for scraping. I usually do not use its xpath functionality though, and opt for its ElementPath library instead. It is very similar in syntax. Below is how I would port your scrapy code.
Going line by line:
initialization:
from lxml import etree

# Analogue of Scrapy's xpath('.../text()').extract() for lxml etree nodes.
def extract_text(elem):
    """Return the concatenated text of *elem* and all its descendants.

    Returns None when *elem* is None (i.e. the row lacked that cell), so
    the caller can print the result either way — the garbled original both
    printed and returned None, which would emit "None" twice.
    """
    if elem is None:
        return None
    return ''.join(elem.itertext())

# wgetUrl, url, date as defined in the original question.
# Fix: parse the variable we actually assigned — the original fetched into
# `data` but then called etree.HTML(content) on an undefined name.
content = wgetUrl(url + date)
tree = etree.HTML(content)
line 1
# original
for row in response.css("div#flexBox_flex_calendar_mainCal table tr.calendar_row"):
# ported
for row in tree.findall(r'.//div[@id="flexBox_flex_calendar_mainCal"]//table/tr[@class="calendar_row"]'):
line 2
print "================"
line 3
# original
print row.xpath(".//td[@class='time']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="time"]'))
line 4
# original
print row.xpath(".//td[@class='currency']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="currency"]'))
line 5
# original
print row.xpath(".//td[@class='impact']/span/@title").extract()
# ported
td = row.find(r'.//td[@class="impact"]/span')
if td is not None and 'title' in td.attrib:
    print td.attrib['title']
line 6
# original
print row.xpath(".//td[@class='event']/span/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="event"]/span'))
line 7
# original
print row.xpath(".//td[@class='actual']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="actual"]'))
line 8
# original
print row.xpath(".//td[@class='forecast']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="forecast"]'))
line 9
# original
print row.xpath(".//td[@class='previous']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="previous"]'))
line 10
print "================"
And all together now:
def wgetUrl(target):
    """Fetch *target* and return the response body as a string.

    Placeholder for the wgetUrl the original question already defines;
    the real implementation lives there.
    """
    raise NotImplementedError('use the wgetUrl from the original question')


def extract_text(elem):
    """Analogue of Scrapy's xpath('.../text()').extract() for etree nodes.

    Returns the concatenated text of *elem* and all its descendants, or
    None when *elem* is None (the row lacked that cell). The garbled
    original both printed and returned None, which would emit "None" twice.
    """
    if elem is None:
        return None
    return ''.join(elem.itertext())


def scrape_calendar(your_url):
    """Print every field of each calendar row found at *your_url*."""
    # Imported here so the helpers above stay importable without lxml.
    from lxml import etree

    content = wgetUrl(your_url)
    node = etree.HTML(content)
    row_path = ('.//div[@id="flexBox_flex_calendar_mainCal"]'
                '//table/tr[@class="calendar_row"]')
    for row in node.findall(row_path):
        print("================")
        print(extract_text(row.find('.//td[@class="time"]')))
        print(extract_text(row.find('.//td[@class="currency"]')))
        # The impact cell carries its value in the span's title attribute.
        td = row.find('.//td[@class="impact"]/span')
        if td is not None and 'title' in td.attrib:
            print(td.attrib['title'])
        print(extract_text(row.find('.//td[@class="event"]/span')))
        print(extract_text(row.find('.//td[@class="actual"]')))
        print(extract_text(row.find('.//td[@class="forecast"]')))
        print(extract_text(row.find('.//td[@class="previous"]')))
        print("================")


if __name__ == '__main__':
    scrape_calendar(your_url)  # your_url as defined in the original question
Post a Comment for "Converting Scrapy To Lxml"