09-渲染类
[toc]
5.4 渲染类
为了提升这些功能后续的易用性,下面会把用到的方法封装到一个类中。其源代码位于本书源码文件的 chp5 文件夹中,文件名为 browser_render.py。
import sys
import time
class BrowserRender(QWebView):
    """QWebView subclass bundling helpers for loading and driving a page.

    The enclosing module must import ``QApplication``, ``QWebView``,
    ``QEventLoop``, ``QTimer`` and ``QUrl`` from the Qt binding in use;
    those names are referenced here but not defined by this class.
    """

    def __init__(self, show=True):
        """Create the Qt application and the web view.

        show -- when true, make the browser window visible.
        """
        # Qt requires the application object to exist before any widget
        # is constructed, so it is created ahead of QWebView.__init__.
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        if show:
            self.show()  # show the browser

    def download(self, url, timeout=60):
        """Load *url* and block until it finishes or *timeout* elapses.

        url     -- address to load.
        timeout -- maximum wait in seconds.

        Returns the page HTML when loading finished in time; otherwise
        prints a message and returns None.
        """
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        # Whichever fires first (timer expiry or load completion) ends
        # the local event loop below.
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        # QTimer works in integer milliseconds; the cast lets callers
        # pass fractional seconds.
        timer.start(int(timeout * 1000))
        loop.exec_()  # delay here until download finished
        if timer.isActive():
            # The single-shot timer has not fired yet, so the loop was
            # ended by loadFinished: downloaded successfully.
            timer.stop()
            return self.html()
        # The timer fired first: timed out.
        print('Request timed out: ' + url)
        return None

    def html(self):
        """Shortcut to return the current HTML of the main frame."""
        return self.page().mainFrame().toHtml()

    def find(self, pattern):
        """Return all elements of the main frame that match *pattern*."""
        return self.page().mainFrame().findAllElements(pattern)

    def attr(self, pattern, name, value):
        """Set attribute *name* to *value* on every matching element."""
        for e in self.find(pattern):
            e.setAttribute(name, value)

    def text(self, pattern, value):
        """Set the plain text of every matching element to *value*."""
        for e in self.find(pattern):
            e.setPlainText(value)

    def click(self, pattern):
        """Click every matching element via JavaScript."""
        for e in self.find(pattern):
            e.evaluateJavaScript("this.click()")

    def wait_load(self, pattern, timeout=60):
        """Poll until *pattern* matches something, then return the matches.

        pattern -- selector passed to find().
        timeout -- maximum wait in seconds.

        Returns the matches as soon as find() yields a truthy result;
        otherwise prints a message and returns None once the deadline
        has passed.
        """
        deadline = time.time() + timeout
        while time.time() < deadline:
            # Let Qt handle pending events so the page can keep updating
            # between polls.
            self.app.processEvents()
            matches = self.find(pattern)
            if matches:
                return matches
        print('Wait load timed out')
        return None
你可能已经注意到,在 download() 和 wait_load() 方法中增加了一些用于处理超时的代码:download() 使用单次触发的定时器跟踪等待时间,并在截止时间到达时退出事件循环;wait_load() 则记录一个截止时间,到期后结束轮询。否则,当出现网络问题时,这两个方法就会无休止地等待下去。
下面是使用这个新实现的类抓取搜索页面的代码。
>>> br = BrowserRender()
>>> br.download('http://example.python-scraping.com/search')
>>> br.attr('#search_term', 'value', '.')
>>> br.text('#page_size option:checked', '1000')
>>> br.click('#search')
>>> elements = br.wait_load('#results a')
>>> countries_or_districts = [e.toPlainText().strip() for e in elements]
>>> print countries_or_districts
['Afghanistan', 'Aland Islands', ... , 'Zambia', 'Zimbabwe']