首页 > html教程 > 正文

「松勤软件自动化测试」Python的解析HTML的几种操作方式

转载 2019-02-11 0 4
「松勤软件自动化测试」Python的解析HTML的几种操作方式

遇见松勤,高薪终会不期而遇!


解析HTML是爬虫后的重要的一个处理数据的环节。以下记录解析HTML的几种方式。

先介绍基础的辅助函数,主要用于获取HTML并输出解析后的结果

# The parsing function is passed in as an argument so the examples further
# down can swap in a different parser without touching the download code.
def get_html(url, paraser=bs4_paraser):
    """Download *url* and return ``paraser(body)``.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the decompressed response body and
            returns the parsed result. Defaults to ``bs4_paraser``.

    Returns:
        Whatever *paraser* returns when the response code is 200,
        otherwise ``None``.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    # Only sets an attribute on the response object; nothing below reads it.
    response.encoding = 'utf-8'
    if response.code == 200:
        # NOTE(review): the body is always run through gzip here, so this
        # assumes the server honoured the gzip Accept-Encoding above.
        data = StringIO.StringIO(response.read())
        gzipper = gzip.GzipFile(fileobj=data)
        data = gzipper.read()
        # For offline experiments a saved page can be parsed instead, e.g.
        # open('E:/h5/haPkY0osd0r5UB.html').read()
        value = paraser(data)
        return value
    return None

# Demo: fetch one movie page, parse it with the lxml-based parser and print
# every extracted review.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None when the response code is not 200; treat that as
# "no reviews" instead of failing on iteration.
for row in value or []:
    print(row)

1,lxml.html的方式进行解析,

lxml XML工具包是C库libxml2和libxslt的Pythonic绑定。它的独特之处在于它将这些库的速度和XML特性完整性与原生Python API的简单性相结合,大多数兼容但优于众所周知的ElementTree API。最新版本适用于从2.6到3.5的所有CPython版本。有关lxml项目的背景和目标的更多信息,请参阅简介。常见问题解答中回答了一些常见问题。

[官网](http://lxml.de/)

def lxml_parser(page):
    """Extract the review entries from *page* with lxml XPath queries.

    Args:
        page: HTML source of the page, passed straight to ``etree.HTML``.

    Returns:
        A list with one dict per review item, holding the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``.
    """
    data = []
    doc = etree.HTML(page)
    all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
    for row in all_div:
        # Every div.item inside the wrapper is one review.
        all_div_item = row.xpath('.//div[@class="item"]')
        for r in all_div_item:
            # The title block carries the link, rating, time and like count.
            title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
            if not title:
                # No title block: nothing to extract for this item.
                continue
            head = title[0]
            value = {}
            value['title'] = head.xpath('./a/text()')[0]
            value['title_href'] = head.xpath('./a/@href')[0]
            # The rating is read from the first number in the inner span's
            # style attribute and divided by 20.
            score_text = head.xpath('./div/span/span/@style')[0]
            score_text = re.search(r'\d+', score_text).group()
            # Floor division keeps the integer result that "/" produced
            # under Python 2.
            value['score'] = int(score_text) // 20
            # Publication time.
            value['time'] = head.xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review.
            value['people'] = int(
                re.search(r'\d+', head.xpath('./div[@class="num"]/span/text()')[0]).group())
            data.append(value)
    return data

2,使用BeautifulSoup,不多说了,推荐一篇讲解非常好的文章

[应用讲解](http://www.bkjia.com/Pythonjc/992499.html)

def bs4_paraser(html):
    """Extract the review entries from *html* with BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per review item that has a title block,
        holding the keys ``title``, ``title_href``, ``score``, ``time``
        and ``people``.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the wrapper that holds the review list (first match only).
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every div.item inside the wrapper is one review.
        all_div_item = row.find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            # The title block carries the link, rating, time and like count.
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if not title:
                # No title block: nothing to extract for this item.
                continue
            head = title[0]
            value = {}
            value['title'] = head.a.string
            value['title_href'] = head.a['href']
            # The rating is read from the first number in the inner span's
            # style attribute and divided by 20.
            score_text = head.div.span.span['style']
            score_text = re.search(r'\d+', score_text).group()
            # Floor division keeps the integer result that "/" produced
            # under Python 2.
            value['score'] = int(score_text) // 20
            # Publication time.
            value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
            # Number of people who liked the review.
            value['people'] = int(
                re.search(r'\d+', head.find_all('div', attrs={'class': 'num'})[0].span.string).group())
            all_value.append(value)
    return all_value

3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法,(哈哈)

class CommentParaser(SGMLParser):
    """Event-driven review extractor built on ``SGMLParser``.

    The parser tracks which of the nested review ``div`` elements it is
    currently inside with boolean flags, and which kind of ``span`` it is
    inside with ``__span_state``. Finished reviews are collected in
    ``self.data`` as dicts with the keys ``href``, ``title``, ``score``,
    ``time`` and ``people`` (each key is present only if it was seen).

    NOTE(review): ``SGMLParser`` is presumably ``sgmllib.SGMLParser``
    (Python 2 only) -- confirm against the file's imports.
    """

    def __init__(self):
        # Explicit base-class call, as in the original code.
        SGMLParser.__init__(self)
        # One flag per nested div level of a review entry.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title link.
        self.__start_a = False
        # Span state: 0 = none, 1 = inside span.rating, 2 = inside
        # span.time, 3 = score span handled, 4 = span inside div.num.
        self.__span_state = 0
        # Review currently being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Raise the flag that matches the opening div's class attribute."""
        for k, v in attrs:
            if k == 'class' and v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif k == 'class' and v == 'item':
                self.__start_div_item = True
            elif k == 'class' and v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif k == 'class' and v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif k == 'class' and v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Lower the innermost raised flag; store the review when an item closes."""
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    # Closing the item div: the review is complete.
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def start_a(self, attrs):
        """Record the href of a link that opens inside a title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        """Leave the title link."""
        if (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear and self.__start_a):
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state and read the score from a style attribute."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # A span opened while inside span.rating: its style
                    # attribute holds the number the score is derived from.
                    for k, v in attrs:
                        if k == 'style':
                            match = re.search(r'\d+', v)
                            if match:
                                # Floor division keeps the integer result
                                # that "/" produced under Python 2.
                                self.__value['score'] = int(match.group()) // 20
                    self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def end_span(self):
        """Any closing span resets the span state."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text according to the current position in the document."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits is ignored instead of raising.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())

def sgl_parser(html):
    """Parse *html* with ``CommentParaser`` and return the collected reviews.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list after the whole input has been fed.
    """
    parser = CommentParaser()
    # Feed the function's own argument; the garbled text passed an
    # undefined name ``HTML``.
    parser.feed(html)
    return parser.data

4,HTMLParser,与3原理相似,就是调用的方法不太一样,基本上可以共用,

class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven review extractor built on ``HTMLParser.HTMLParser``.

    The parser tracks which of the nested review ``div`` elements it is
    currently inside with boolean flags, and which kind of ``span`` it is
    inside with ``__span_state``. Finished reviews are collected in
    ``self.data`` as dicts with the keys ``href``, ``title``, ``score``,
    ``time`` and ``people`` (each key is present only if it was seen).
    """

    def __init__(self):
        # Explicit base-class call, as in the original code.
        HTMLParser.HTMLParser.__init__(self)
        # One flag per nested div level of a review entry.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title link.
        self.__start_a = False
        # Span state: 0 = none, 1 = inside span.rating, 2 = inside
        # span.time, 3 = score span handled, 4 = span inside div.num.
        self.__span_state = 0
        # Review currently being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Update the flags and span state for an opening div, a or span tag."""
        if tag == 'div':
            for k, v in attrs:
                if k == 'class' and v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif k == 'class' and v == 'item':
                    self.__start_div_item = True
                elif k == 'class' and v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif k == 'class' and v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif k == 'class' and v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
        elif tag == 'span':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                if self.__start_div_ratingwrap:
                    if self.__span_state != 1:
                        for k, v in attrs:
                            if k == 'class' and v == 'rating':
                                self.__span_state = 1
                            elif k == 'class' and v == 'time':
                                self.__span_state = 2
                    else:
                        # A span opened while inside span.rating: its style
                        # attribute holds the number the score is derived
                        # from.
                        for k, v in attrs:
                            if k == 'style':
                                match = re.search(r'\d+', v)
                                if match:
                                    # Floor division keeps the integer
                                    # result that "/" produced under
                                    # Python 2.
                                    self.__value['score'] = int(match.group()) // 20
                        self.__span_state = 3
                elif self.__start_div_num:
                    self.__span_state = 4

    def handle_endtag(self, tag):
        """Lower the innermost raised flag; store the review when an item closes."""
        if tag == 'div':
            if self.__start_div_yingping:
                if self.__start_div_item:
                    if self.__start_div_gclear:
                        if self.__start_div_num or self.__start_div_ratingwrap:
                            if self.__start_div_num:
                                self.__start_div_num = False
                            if self.__start_div_ratingwrap:
                                self.__start_div_ratingwrap = False
                        else:
                            self.__start_div_gclear = False
                    else:
                        # Closing the item div: the review is complete.
                        self.data.append(self.__value)
                        self.__value = {}
                        self.__start_div_item = False
                else:
                    self.__start_div_yingping = False
        elif tag == 'a':
            if (self.__start_div_yingping and self.__start_div_item
                    and self.__start_div_gclear and self.__start_a):
                self.__start_a = False
        elif tag == 'span':
            # Any closing span resets the span state.
            self.__span_state = 0

    def handle_data(self, data):
        """Store text according to the current position in the document."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits is ignored instead of raising.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())

def html_parser(html):
    """Parse *html* with ``CommentHTMLParser`` and return the collected reviews.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list after the whole input has been fed.
    """
    parser = CommentHTMLParser()
    # Feed the function's own argument; the garbled text passed an
    # undefined name ``HTML``.
    parser.feed(html)
    return parser.data

3,4对于该案例来说确实是不太适合,趁现在有空记录下来,供学习使用!

遇见松勤,高薪终会不期而遇!

更多干货、学习资料免费领:

松勤网:www.songqinnet.com

松勤软件测试职业交流QQ群:814326044

相关文章


  • 天企网络:网站SEO优化之如何让正确使用HTML标记
  • bootstrap教程html文档版,附一个基于bootstrap的后台管理系统
  • Java小练习用HTML中的标签实现跳转百度官网
  • 南京seo优化之网站html标签优化分享(徐金华seo博客)
  • 第五节html的链接重要组成,和重要元素记住这些就够了
  • HTML之行内标签、块标签、行内块标签(广州中软卓越)
  • 七夕马上要到了,用HTML语言编写一个礼物,送给女朋友吧
  • “我的人生,我做主!”——中科韬睿TR1802班HTML项目验收