遇见松勤,高薪终会不期而遇!
解析HTML是爬虫抓取页面之后处理数据的一个重要环节。以下记录解析HTML的几种方式。
先介绍基础的辅助函数,主要用于获取HTML并输出解析后的结果
# Pass the parser function in as an argument so it is easy to swap below
def get_html(url, paraser=None):
    """Download *url* and return what *paraser* extracts from the page body.

    Args:
        url: Address of the page to download.
        paraser: Callable that takes the page body and returns the parsed
            result. When omitted, ``bs4_paraser`` is used. The default is
            resolved at call time because ``bs4_paraser`` is defined further
            down the file and is not yet bound when this ``def`` executes.

    Returns:
        The value returned by *paraser* when the response status is 200,
        otherwise ``None``.
    """
    if paraser is None:
        paraser = bs4_paraser

    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    response.encoding = 'utf-8'
    if response.code != 200:
        return None

    body = response.read()
    # Only gunzip when the server says the body is gzip-compressed; feeding
    # an uncompressed body to GzipFile raises an error.
    content_encoding = response.info().get('Content-Encoding') or ''
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    return paraser(body)
# Demo: fetch one movie page, parse it with the lxml-based parser and print
# every extracted review.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response, so fall back to an empty list.
for row in value or []:
    print(row)
1,使用lxml.html的方式进行解析。
lxml XML工具包是C库libxml2和libxslt的Pythonic绑定。它的独特之处在于它将这些库的速度和XML特性完整性与原生Python API的简单性相结合,大多数兼容但优于众所周知的ElementTree API。最新版本适用于从2.6到3.5的所有CPython版本。有关lxml项目的背景和目标的更多信息,请参阅简介。常见问题解答中回答了一些常见问题。
[官网](http://lxml.de/)
def lxml_parser(page):
    """Extract the reviews from *page* using lxml and XPath.

    Args:
        page: HTML source of the page.

    Returns:
        A list with one dict per review, holding the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``.
    """
    data = []
    doc = etree.HTML(page)
    all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
    for row in all_div:
        # Each "item" div is one review.
        all_div_item = row.xpath('.//div[@class="item"]')
        for r in all_div_item:
            value = {}
            # Title section of the review.
            title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
            value['title'] = title[0].xpath('./a/text()')[0]
            value['title_href'] = title[0].xpath('./a/@href')[0]
            # The score is taken from the first number in the inner span's
            # style attribute.
            score_text = title[0].xpath('./div/span/span/@style')[0]
            score_text = re.search(r'\d+', score_text).group()
            # Floor division keeps the integer result that Python 2's "/"
            # produced for ints.
            value['score'] = int(score_text) // 20
            # Time of the review.
            value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review.
            likes_text = title[0].xpath('./div[@class="num"]/span/text()')[0]
            value['people'] = int(re.search(r'\d+', likes_text).group())
            data.append(value)
    return data
2,使用BeautifulSoup,不多说了,推荐一篇讲解非常好的文章
[应用讲解](http://www.bkjia.com/Pythonjc/992499.html)
def bs4_paraser(html):
    """Extract the reviews from *html* using BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per review "item" div. The dict holds the keys
        ``title``, ``title_href``, ``score``, ``time`` and ``people`` when
        the item has a title section, and is empty otherwise.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # The review list container (only the first one is used).
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Each "item" div is one review.
        all_div_item = row.find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            value = {}
            # Title section of the review.
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if title is not None and len(title) > 0:
                head = title[0]
                value['title'] = head.a.string
                value['title_href'] = head.a['href']
                # The score is taken from the first number in the inner
                # span's style attribute.
                score_text = head.div.span.span['style']
                score_text = re.search(r'\d+', score_text).group()
                # Floor division keeps the integer result that Python 2's
                # "/" produced for ints.
                value['score'] = int(score_text) // 20
                # Time of the review.
                value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
                # Number of people who liked the review.
                likes_text = head.find_all('div', attrs={'class': 'num'})[0].span.string
                value['people'] = int(re.search(r'\d+', likes_text).group())
            all_value.append(value)
    return all_value
3,使用SGMLParser,主要是通过处理start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法(哈哈)
class CommentParaser(SGMLParser):
    """Event-driven review extractor built on ``SGMLParser``.

    The parser tracks which of the relevant ``<div>`` containers it is
    currently inside through boolean flags, and which kind of ``<span>`` is
    open through ``__span_state``. Finished reviews are collected in
    ``self.data`` as dicts that may hold the keys ``href``, ``title``,
    ``score``, ``time`` and ``people``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Flags for the enclosing <div> containers, keyed by their class.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title <a> element.
        self.__start_a = False
        # State of the current <span>: 0 = none, 1 = rating wrapper,
        # 2 = time, 3 = score, 4 = like count.
        self.__span_state = 0
        # Review being assembled, and the list of finished reviews.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Raise the flag matching the class of the opened ``<div>``."""
        for k, v in attrs:
            if k == 'class' and v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif k == 'class' and v == 'item':
                self.__start_div_item = True
            elif k == 'class' and v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif k == 'class' and v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif k == 'class' and v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Clear the innermost active flag; store the review when an item ends."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
        elif not self.__start_div_gclear:
            # No title section is open, so this closing tag is treated as the
            # end of the item: store the review and start a fresh one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
        elif self.__start_div_num or self.__start_div_ratingwrap:
            if self.__start_div_num:
                self.__start_div_num = False
            if self.__start_div_ratingwrap:
                self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the link target of an ``<a>`` inside the title section."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        """Leave the title link."""
        if (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear and self.__start_a):
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state; read the score from the nested rating span."""
        if not (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested inside the rating span: the score is taken from
                # the first number in its style attribute.
                for k, v in attrs:
                    if k == 'style':
                        score_text = re.search(r'\d+', v).group()
                        # Floor division keeps the integer result that
                        # Python 2's "/" produced for ints.
                        self.__value['score'] = int(score_text) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Reset the span state."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)
def sgl_parser(html):
    """Parse *html* with ``CommentParaser`` and return the collected reviews.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list (one dict per review).
    """
    parser = CommentParaser()
    # Feed the ``html`` argument; the garbled text referenced an undefined
    # name ``HTML``.
    parser.feed(html)
    return parser.data
4,HTMLParser,与3原理相似,只是调用的方法不太一样,代码基本上可以共用。
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven review extractor built on ``HTMLParser``.

    The parser tracks which of the relevant ``<div>`` containers it is
    currently inside through boolean flags, and which kind of ``<span>`` is
    open through ``__span_state``. Finished reviews are collected in
    ``self.data`` as dicts that may hold the keys ``href``, ``title``,
    ``score``, ``time`` and ``people``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Flags for the enclosing <div> containers, keyed by their class.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title <a> element.
        self.__start_a = False
        # State of the current <span>: 0 = none, 1 = rating wrapper,
        # 2 = time, 3 = score, 4 = like count.
        self.__span_state = 0
        # Review being assembled, and the list of finished reviews.
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Update flags and state for an opening ``div``, ``a`` or ``span``."""
        in_title = (self.__start_div_yingping and self.__start_div_item
                    and self.__start_div_gclear)
        if tag == 'div':
            for k, v in attrs:
                if k == 'class' and v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif k == 'class' and v == 'item':
                    self.__start_div_item = True
                elif k == 'class' and v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif k == 'class' and v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif k == 'class' and v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if in_title:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
        elif tag == 'span':
            if not in_title:
                return
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # Span nested inside the rating span: the score is taken
                    # from the first number in its style attribute.
                    for k, v in attrs:
                        if k == 'style':
                            score_text = re.search(r'\d+', v).group()
                            # Floor division keeps the integer result that
                            # Python 2's "/" produced for ints.
                            self.__value['score'] = int(score_text) // 20
                    self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def handle_endtag(self, tag):
        """Clear flags and state for a closing ``div``, ``a`` or ``span``."""
        if tag == 'div':
            if not self.__start_div_yingping:
                return
            if not self.__start_div_item:
                self.__start_div_yingping = False
            elif not self.__start_div_gclear:
                # No title section is open, so this closing tag is treated as
                # the end of the item: store the review and start a fresh one.
                self.data.append(self.__value)
                self.__value = {}
                self.__start_div_item = False
            elif self.__start_div_num or self.__start_div_ratingwrap:
                if self.__start_div_num:
                    self.__start_div_num = False
                if self.__start_div_ratingwrap:
                    self.__start_div_ratingwrap = False
            else:
                self.__start_div_gclear = False
        elif tag == 'a':
            if (self.__start_div_yingping and self.__start_div_item
                    and self.__start_div_gclear and self.__start_a):
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)
def html_parser(html):
    """Parse *html* with ``CommentHTMLParser`` and return the collected reviews.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list (one dict per review).
    """
    parser = CommentHTMLParser()
    # Feed the ``html`` argument; the garbled text referenced an undefined
    # name ``HTML``.
    parser.feed(html)
    return parser.data
3、4对于该案例来说确实不太适合,趁现在有空记录下来,供学习使用!
遇见松勤,高薪终会不期而遇!
更多干货、学习资料免费领:
松勤网:www.songqinnet.com
松勤软件测试职业交流QQ群:814326044