A few examples using Python + requests + lxml.
The examples do not include retry-on-failure logic; that would be worth adding (a minimal retry sketch follows these notes).
The third example adds access-frequency control (rate limiting).
The image-crawling example adds multithreading, which makes the crawl noticeably faster.
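A minimal sketch of what retry-on-failure could look like, assuming a hypothetical get_with_retry helper; the attempt count, delay and timeout values are arbitrary and are not used by the examples below.

Python code:

import time
import requests

def get_with_retry(url, max_attempts=3, delay=2, **kwargs):
    # Hypothetical helper: try the request up to max_attempts times, sleeping between attempts.
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == max_attempts:
                raise
            time.sleep(delay)

#usage: request = get_with_retry("http://news.163.com/")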
Example: parsing the 163 news list:
Python code:

#!/usr/bin/python
# encoding=gbk
# This only walks the news links; pages with many links (blogs, topic pages and so on) are not handled.
# To handle those, inspect the url, work out what kind of page it is and process it accordingly.
import sys
import requests
import datetime
import time
import MySQLdb
import chardet
import lxml.html.soupparser as soupparser
import lxml.etree as etree

start_datetime = datetime.datetime.now()


def parseFromWin1252(str):
    # Some articles come back garbled: the text is windows-1252 and has to be re-decoded as GBK.
    #print len(tt.decode("ISO-8859-1").encode("windows-1252").decode("GBK"))
    #print len(tt)
    try:
        return str.encode("windows-1252").decode("GBK")
    except UnicodeEncodeError:
        #print "UnicodeEncodeError"
        return str
    except UnicodeDecodeError:
        #print "UnicodeDecodeError"
        return str


def resolveAndSaveNewContentFromLink(link, linkTitle, cursor):
    # Open a link and extract its content.
    # Two cases yield no content: 1. pages without a title, probably topic pages;
    # 2. pages that raise an exception, which are not handled yet.
    print u"处理:", link
    request = requests.get(link)
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[0]
        titles = body.xpath("//h1[@id='h1title']")
        if len(titles) > 0:
            # the page has a title
            title = parseFromWin1252(titles[0].text)
            print u"@TITLE:", request.encoding, title, link
            newContents = body.xpath("//div[@id='endText']//p")
            alist = []
            for content in newContents:
                if content.text != None:
                    alist.append(content.text)
            text = parseFromWin1252("<br><br>".join(alist))
            values = [link, title, text, "Success"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
        else:
            # no title
            title = parseFromWin1252(linkTitle)
            print u"#NO_TITLE:", request.encoding, title, link
            values = [link, title, "", "NO_TITLE"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
    except TypeError:
        # parsing raised an exception
        title = parseFromWin1252(linkTitle)
        print u"$TypeError:", request.encoding, title, link
        values = [link, title, "", "TypeError"]
        cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)


# helper methods
def resolveAndSaveLinks(body, cursor):
    print u"解析html的Link"
    links = body.xpath("//ul[@class='mod-list main-list']//a")
    print u"处理数据"
    count = 1
    for item in links:
        # links that contain an em tag cannot be parsed here
        if item.text != None:
            values = [item.get("href"), item.text]
            cursor.execute("insert into links(url,text) value(%s,%s)", values)
            resolveAndSaveNewContentFromLink(item.get("href"), item.text, cursor)
            #time.sleep(100)  # pause here to avoid getting banned?
        print u"完成", "<resolveAndSaveLinks>[%s:%s]" % (len(links), count)
        count = count + 1
    print "----------------------------------------------------------"
    print u"保存数据完成,记录数[", len(links), "]"


def resolveAndSaveEmInLinks(body, cursor):
    print u"解析html的包含em元素的Link"
    ems = body.xpath("//ul[@class='mod-list main-list']//em")
    print u"处理数据"
    count = 1
    for item in ems:
        values = [item.getparent().get("href"), item.text]
        cursor.execute("insert into links(url,text) value(%s,%s)", values)
        resolveAndSaveNewContentFromLink(item.getparent().get("href"), item.text, cursor)
        #time.sleep(100)  # pause here to avoid getting banned?
        print u"完成", "<resolveAndSaveEmInLinks>[%s:%s]" % (len(ems), count)
        count = count + 1
    print "----------------------------------------------------------"
    print u"保存数据完成,记录数[", len(ems), "]"


def resolve():
    print u"打开链接"
    req = requests.get("http://news.163.com/")
    content = req.content
    dom = soupparser.fromstring(content)
    body = dom[1]
    print u"链接数据库"
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from links")
    cursor.execute("delete from texts")
    #resolveAndSaveNewContentFromLink("http://auto.163.com/13/0929/02/99TGSGRJ00084TUR.html", u"测试", cursor)
    #if True:
    #    return
    print u"解析并保存到数据库"
    # walk the links without an em tag
    resolveAndSaveLinks(body, cursor)
    # walk the links that contain an em tag
    resolveAndSaveEmInLinks(body, cursor)
    cursor.close()
    conn.close()
    print u"遍历完成"


# run
resolve()
end_datetime = datetime.datetime.now()
print u"耗时", (end_datetime - start_datetime).seconds, u"秒"
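The parseFromWin1252 trick above re-encodes text that was decoded with the wrong charset. An alternative sketch, assuming the real encoding can be detected by requests (apparent_encoding is backed by the chardet library the script already imports); whether this covers every mis-encoded 163 page is untested:

Python code:

import requests

def fetch_unicode(url):
    # Hypothetical helper: let requests re-decode the body with the detected encoding
    # instead of the charset it guessed from the response headers.
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    return response.text  # a unicode object in Python 2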
Example: crawling qiushibaike articles, covering only the first few categories on the navigation bar (hot, latest, and so on):
Python code:

#!/usr/bin/ScanningQiuShiBaiKe.py
# encoding=gbk
import sys
import os
import MySQLdb
import requests
import datetime
import time
import lxml.html.soupparser as soupparser
import lxml.etree as etree

currentPageId = "currentPageId"


def getImageFile(imgUrl):
    # Download a file, write it to the local disk and return the file name.
    local_filename = imgUrl.split('/')[-1]
    local_filename = "/home/pandy/tmp/" + local_filename
    print u"下载文件成功: ", local_filename
    r = requests.get(imgUrl, stream=True)  # here we need to set the stream=True parameter
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename


def scannintArticle(cursor, type, url, article):
    # Process one article block.
    articleStr = etree.tostring(article)
    articleBody = soupparser.fromstring(articleStr)
    details = articleBody.xpath("//div[@class='detail']")
    authors = articleBody.xpath("//div[@class='author']")
    contents = articleBody.xpath("//div[@class='content']")
    thumbs = articleBody.xpath("//div[@class='thumb']")
    values = [type, url]
    if len(details) > 0:
        detailStr = etree.tostring(details[0])
        detail = soupparser.fromstring(detailStr)
        values.append(detail.xpath("//a")[0].text)
        values.append(detail.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(authors) > 0:
        authorStr = etree.tostring(authors[0])
        author = soupparser.fromstring(authorStr)
        values.append(author.xpath("//a")[0].text)
        values.append(author.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(contents) > 0:
        contentStr = etree.tostring(contents[0])
        values.append(contents[0].text)
    else:
        # only one placeholder here, so the value list stays at 10 entries for the 10 insert columns
        values.append("")
    if len(thumbs) > 0:
        thumbStr = etree.tostring(thumbs[0])
        thumb = soupparser.fromstring(thumbStr)
        imgUrl = thumb.xpath("//img")[0].get("src")
        values.append(imgUrl)
        # Download the image to a temporary file, read it back for the database, then delete it.
        local_filename = getImageFile(imgUrl)
        f = open(local_filename, "rb")
        b = f.read()
        f.close()
        os.remove(local_filename)
        values.append(MySQLdb.Binary(b))
    else:
        values.append("")
        values.append(None)
    values.append("Success")
    print values
    cursor.execute(
        "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content,status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        values)


def scanning4typeArticle(cursor, type, url):
    # Scan one page.
    request = requests.get(url)
    #print request.encoding
    print url
    #print len(request.content)
    #print request.content
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[1]
        # find the articles on this page
        articleList = body.xpath("//div[@class='block untagged mb15 bs2']")
        for article in articleList:
            scannintArticle(cursor, type, url, article)
    except:
        print "Error"
        values = [type, url, '', '', '', '', '', '', None, "Error"]
        cursor.execute(
            "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content, status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            values)


def scanning4type(cursor, type, url, subfix):
    # Read the number of pages, then open them one by one.
    print u"开始扫描文章"
    request = requests.get(url)
    dom = soupparser.fromstring(request.content)
    body = dom[0]
    # the largest page number sits in the pager at the bottom
    pagebars = body.xpath("//div[@class='pagebar']/a")
    if len(pagebars) > 2:
        maxPageSize = int(pagebars[len(pagebars) - 2].text) + 1
        # open the pages one by one
        for i in range(1, maxPageSize):
            scanningUrl = "".join([url, subfix]).replace(currentPageId, str(i))
            scanning4typeArticle(cursor, type, scanningUrl)
    print u"扫描文章完成"


def main():
    # main entry point: open the database
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from qs_article")
    # scan a few types: the first few categories on the navigation bar
    scanning4type(cursor, "8HR", "http://www.qiushibaike.com/8hr", "".join(["/page/", "currentPageId", "?s=4602020"]))
    #scanning4type(cursor, "HOT", "http://www.qiushibaike.com/hot", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "IMGRANK", "http://www.qiushibaike.com/imgrank", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "LATE", "http://www.qiushibaike.com/late", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4typeArticle(cursor, type, "http://www.qiushibaike.com/late/page/346?s=4602057")
    # close the database
    cursor.close()
    conn.close()


# run the main program
main()
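The example above downloads each thumbnail to a temporary file, reads it back and deletes it before inserting the bytes as a blob. Since requests already exposes the raw body, the temporary file can probably be skipped; a sketch with a hypothetical getImageBytes helper:

Python code:

import MySQLdb
import requests

def getImageBytes(imgUrl):
    # Hypothetical in-memory variant of getImageFile: fetch the image and return its raw bytes.
    r = requests.get(imgUrl)
    return r.content

#usage inside scannintArticle:
#    values.append(MySQLdb.Binary(getImageBytes(imgUrl)))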
Example: crawling images from some Sina blogs, with rate limiting added:
Python code:

#!/usr/bin/python
# encoding=gbk
#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time

maxPage = 100  # maximum number of index pages to scan

requests.adapters.DEFAULT_RETRIES = 5

# rate-limiting settings
DEFAULT_OPEN_PAGE_FREQUENCY = 1    # seconds to wait between index pages
DEFAULT_OPEN_IMAGE_FREQUENCY = 3   # seconds to wait between batches of image downloads
DEFAULT_IMAGE_COUNT = 0            # image counter
DEFAULT_IMAGE_SIZE = 20            # after every DEFAULT_IMAGE_SIZE images, sleep DEFAULT_OPEN_IMAGE_FREQUENCY seconds


def saveImage(title, imageSrc):
    # save an image
    if title == None:
        title = u"无题"
    print u"标题:%s 图片:%s" % (title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(pageUrl):
    # Open a link from the index page and walk the image page behind it.
    global DEFAULT_IMAGE_COUNT
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    print u"遍历图片页面, 标题:%s, 地址: %s " % (titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # there seem to be two addresses: prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # only continue if an image address is present
        if imageSrc != None:
            saveImage(titleStr, imageSrc)
            # rate limiting
            DEFAULT_IMAGE_COUNT = DEFAULT_IMAGE_COUNT + 1
            if DEFAULT_IMAGE_COUNT % DEFAULT_IMAGE_SIZE == 0:
                print u"图片计数:%s, 休息 %s 秒钟后继续\n" % (DEFAULT_IMAGE_COUNT, DEFAULT_OPEN_IMAGE_FREQUENCY)
                time.sleep(DEFAULT_OPEN_IMAGE_FREQUENCY)


def listPicIndex():
    # Walk the index pages by page number.
    for i in range(1, maxPage + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        request = requests.get(url)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            # find every image link on this page; the page behind the link is the one that shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # walk the image page
                listPicPage(link[0].get("href"))
            print u"---------------------------------------------完成一个图片链接, 页数:", i
            # rate limiting
            # time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------完成页数", maxPage, ":", i


def main():
    listPicIndex()
    #listPicPage("http://qing.blog.sina.com.cn/tj/a1509eee330044am.html")


if __name__ == "__main__":
    main()
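The rate limiting above is a global counter plus time.sleep. The same idea can be packaged as a small reusable helper; a sketch with a hypothetical RateLimiter class that none of the scripts here actually use:

Python code:

import time

class RateLimiter(object):
    # Hypothetical helper: sleep for pause_seconds after every batch_size calls to tick().
    def __init__(self, batch_size=20, pause_seconds=3):
        self.batch_size = batch_size
        self.pause_seconds = pause_seconds
        self.count = 0

    def tick(self):
        self.count = self.count + 1
        if self.count % self.batch_size == 0:
            print u"processed %s items, sleeping %s seconds" % (self.count, self.pause_seconds)
            time.sleep(self.pause_seconds)

#usage: limiter = RateLimiter(); call limiter.tick() after each saveImage()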
The example above, rewritten to use multiple threads:
Python code:

#!/usr/bin/python
# encoding=gbk
#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time
import threading

MAX_PAGE = 100   # maximum number of index pages to scan
MAX_ERROR = 10   # maximum number of errors a thread may hit; below this limit it keeps retrying
PAGE_SIZE = 5    # number of pages per thread

DEFAULT_OPEN_PAGE_FREQUENCY = 2             # seconds to sleep after finishing a page
DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY = 5  # seconds to wait before retrying after an error

requests.adapters.DEFAULT_RETRIES = 5


def saveImage(thName, title, imageSrc, currentPath):
    # save an image
    if title == None:
        title = u"无题"
    print u"线程名称:%s, 页码:%s, 标题:%s 图片:%s" % (thName, currentPath, title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(thName, pageUrl, currentPath):
    # Open a link from the index page and walk the image page behind it.
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    #print u"\n\n页码:%s, 遍历图片页面, 标题:%s, 地址: %s " % (currentPath, titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # there seem to be two addresses: prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # only continue if an image address is present
        if imageSrc != None:
            saveImage(thName, titleStr, imageSrc, currentPath)


def listPicIndex(thName, startPath, endPath):
    # Walk the index pages in the range assigned to this thread.
    for i in range(startPath, endPath + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        print url
        request = requests.get(url)
        json_obj = json.loads(request.content)
        error_count = 0
        for item in json_obj["data"]["list"]:
            # find every image link on this page; the page behind the link is the one that shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # walk the image page
                try:
                    listPicPage(thName, link[0].get("href"), i)
                except:
                    if error_count < MAX_ERROR:
                        error_count = error_count + 1
                        # on error, wait a moment and retry once
                        print u"---------------------------------------------休眠%s秒钟后重试, 页数:%s" % (
                            DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY, i)
                        time.sleep(DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY)
                        listPicPage(thName, link[0].get("href"), i)
                    else:
                        print u"出错超过预设次数,退出爬虫。"
            #print u"---------------------------------------------完成一个图片链接, 页数:", i
        # rate limiting
        time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------完成页数", MAX_PAGE, ":", i
    return True


class MyThread(threading.Thread):
    def __init__(self, name, startPath, endPage):
        threading.Thread.__init__(self)
        self.name = name
        self.is_stop = False
        self.startPage = startPath
        self.endPage = endPage

    def run(self):
        # stop the thread once its page range has been walked
        while not self.is_stop:
            self.is_stop = listPicIndex(self.name, self.startPage, self.endPage)

    def stop(self):
        # set the stop flag manually
        self.is_stop = True


if __name__ == "__main__":
    # create one thread per page range
    # note: consecutive ranges share their boundary page (e.g. 1-6 and 6-11)
    count = 1
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = i + PAGE_SIZE
        if endPath > MAX_PAGE:
            endPath = MAX_PAGE
        print startPath, ",", endPath
        t = MyThread("Thread " + str(count), startPath, endPath)
        count = count + 1
        t.start()
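The main block above starts the workers and returns without join(), so the program only keeps running because the threads are non-daemon. If you need to know when every page range is done (for example to print the total elapsed time), it is enough to collect the thread objects and join them; a sketch assuming the same MyThread class, MAX_PAGE and PAGE_SIZE as above:

Python code:

if __name__ == "__main__":
    threads = []
    count = 1
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = min(i + PAGE_SIZE, MAX_PAGE)
        t = MyThread("Thread " + str(count), startPath, endPath)
        count = count + 1
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # block until this thread's page range has been crawled
    print u"all threads finished"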