A few examples using Python + requests + lxml.
The examples do not include retry-on-failure logic; that would be worth adding (a minimal retry sketch follows these notes).
The third example adds access-frequency control (rate limiting).
The image-crawling example adds multithreading, which makes the crawl noticeably faster.
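A minimal sketch of what retry-on-failure could look like, assuming a hypothetical get_with_retry helper; the attempt count, delay and timeout values are arbitrary and are not used by the examples below.

Python code:

import time
import requests

def get_with_retry(url, max_attempts=3, delay=2, **kwargs):
    # Hypothetical helper: try the request up to max_attempts times, sleeping between attempts.
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == max_attempts:
                raise
            time.sleep(delay)

#usage: request = get_with_retry("http://news.163.com/")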
Example: parsing the 163 news list:
Python code:

#!/usr/bin/python
# encoding=gbk
# This only walks the news links; pages with many links (blogs, topic pages and so on) are not handled.
# To handle those, inspect the url, work out what kind of page it is and process it accordingly.
import sys
import requests
import datetime
import time
import MySQLdb
import chardet
import lxml.html.soupparser as soupparser
import lxml.etree as etree

start_datetime = datetime.datetime.now()


def parseFromWin1252(str):
    # Some articles come back garbled: the text is windows-1252 and has to be re-decoded as GBK.
    #print len(tt.decode("ISO-8859-1").encode("windows-1252").decode("GBK"))
    #print len(tt)
    try:
        return str.encode("windows-1252").decode("GBK")
    except UnicodeEncodeError:
        #print "UnicodeEncodeError"
        return str
    except UnicodeDecodeError:
        #print "UnicodeDecodeError"
        return str


def resolveAndSaveNewContentFromLink(link, linkTitle, cursor):
    # Open a link and extract its content.
    # Two cases yield no content: 1. pages without a title, probably topic pages;
    # 2. pages that raise an exception, which are not handled yet.
    print u"处理:", link
    request = requests.get(link)
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[0]
        titles = body.xpath("//h1[@id='h1title']")
        if len(titles) > 0:
            # the page has a title
            title = parseFromWin1252(titles[0].text)
            print u"@TITLE:", request.encoding, title, link
            newContents = body.xpath("//div[@id='endText']//p")
            alist = []
            for content in newContents:
                if content.text != None:
                    alist.append(content.text)
            text = parseFromWin1252("<br><br>".join(alist))
            values = [link, title, text, "Success"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
        else:
            # no title
            title = parseFromWin1252(linkTitle)
            print u"#NO_TITLE:", request.encoding, title, link
            values = [link, title, "", "NO_TITLE"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
    except TypeError:
        # parsing raised an exception
        title = parseFromWin1252(linkTitle)
        print u"$TypeError:", request.encoding, title, link
        values = [link, title, "", "TypeError"]
        cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)


# helper methods
def resolveAndSaveLinks(body, cursor):
    print u"解析html的Link"
    links = body.xpath("//ul[@class='mod-list main-list']//a")
    print u"处理数据"
    count = 1
    for item in links:
        # links that contain an em tag cannot be parsed here
        if item.text != None:
            values = [item.get("href"), item.text]
            cursor.execute("insert into links(url,text) value(%s,%s)", values)
            resolveAndSaveNewContentFromLink(item.get("href"), item.text, cursor)
            #time.sleep(100)  # pause here to avoid getting banned?
        print u"完成", "<resolveAndSaveLinks>[%s:%s]" % (len(links), count)
        count = count + 1
    print "----------------------------------------------------------"
    print u"保存数据完成,记录数[", len(links), "]"


def resolveAndSaveEmInLinks(body, cursor):
    print u"解析html的包含em元素的Link"
    ems = body.xpath("//ul[@class='mod-list main-list']//em")
    print u"处理数据"
    count = 1
    for item in ems:
        values = [item.getparent().get("href"), item.text]
        cursor.execute("insert into links(url,text) value(%s,%s)", values)
        resolveAndSaveNewContentFromLink(item.getparent().get("href"), item.text, cursor)
        #time.sleep(100)  # pause here to avoid getting banned?
        print u"完成", "<resolveAndSaveEmInLinks>[%s:%s]" % (len(ems), count)
        count = count + 1
    print "----------------------------------------------------------"
    print u"保存数据完成,记录数[", len(ems), "]"


def resolve():
    print u"打开链接"
    req = requests.get("http://news.163.com/")
    content = req.content
    dom = soupparser.fromstring(content)
    body = dom[1]
    print u"链接数据库"
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from links")
    cursor.execute("delete from texts")
    #resolveAndSaveNewContentFromLink("http://auto.163.com/13/0929/02/99TGSGRJ00084TUR.html", u"测试", cursor)
    #if True:
    #    return
    print u"解析并保存到数据库"
    # walk the links without an em tag
    resolveAndSaveLinks(body, cursor)
    # walk the links that contain an em tag
    resolveAndSaveEmInLinks(body, cursor)
    cursor.close()
    conn.close()
    print u"遍历完成"


# run
resolve()
end_datetime = datetime.datetime.now()
print u"耗时", (end_datetime - start_datetime).seconds, u"秒"
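The parseFromWin1252 trick above re-encodes text that was decoded with the wrong charset. An alternative sketch, assuming the real encoding can be detected by requests (apparent_encoding is backed by the chardet library the script already imports); whether this covers every mis-encoded 163 page is untested:

Python code:

import requests

def fetch_unicode(url):
    # Hypothetical helper: let requests re-decode the body with the detected encoding
    # instead of the charset it guessed from the response headers.
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    return response.text  # a unicode object in Python 2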
Example: crawling qiushibaike articles, covering only the first few categories on the navigation bar (hot, latest, and so on):
Python code:

#!/usr/bin/ScanningQiuShiBaiKe.py
# encoding=gbk
import sys
import os
import MySQLdb
import requests
import datetime
import time
import lxml.html.soupparser as soupparser
import lxml.etree as etree

currentPageId = "currentPageId"


def getImageFile(imgUrl):
    # Download a file, write it to the local disk and return the file name.
    local_filename = imgUrl.split('/')[-1]
    local_filename = "/home/pandy/tmp/" + local_filename
    print u"下载文件成功: ", local_filename
    r = requests.get(imgUrl, stream=True)  # here we need to set the stream=True parameter
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename


def scannintArticle(cursor, type, url, article):
    # Process one article block.
    articleStr = etree.tostring(article)
    articleBody = soupparser.fromstring(articleStr)
    details = articleBody.xpath("//div[@class='detail']")
    authors = articleBody.xpath("//div[@class='author']")
    contents = articleBody.xpath("//div[@class='content']")
    thumbs = articleBody.xpath("//div[@class='thumb']")
    values = [type, url]
    if len(details) > 0:
        detailStr = etree.tostring(details[0])
        detail = soupparser.fromstring(detailStr)
        values.append(detail.xpath("//a")[0].text)
        values.append(detail.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(authors) > 0:
        authorStr = etree.tostring(authors[0])
        author = soupparser.fromstring(authorStr)
        values.append(author.xpath("//a")[0].text)
        values.append(author.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(contents) > 0:
        contentStr = etree.tostring(contents[0])
        values.append(contents[0].text)
    else:
        # only one placeholder here, so the value list stays at 10 entries for the 10 insert columns
        values.append("")
    if len(thumbs) > 0:
        thumbStr = etree.tostring(thumbs[0])
        thumb = soupparser.fromstring(thumbStr)
        imgUrl = thumb.xpath("//img")[0].get("src")
        values.append(imgUrl)
        # Download the image to a temporary file, read it back for the database, then delete it.
        local_filename = getImageFile(imgUrl)
        f = open(local_filename, "rb")
        b = f.read()
        f.close()
        os.remove(local_filename)
        values.append(MySQLdb.Binary(b))
    else:
        values.append("")
        values.append(None)
    values.append("Success")
    print values
    cursor.execute(
        "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content,status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        values)


def scanning4typeArticle(cursor, type, url):
    # Scan one page.
    request = requests.get(url)
    #print request.encoding
    print url
    #print len(request.content)
    #print request.content
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[1]
        # find the articles on this page
        articleList = body.xpath("//div[@class='block untagged mb15 bs2']")
        for article in articleList:
            scannintArticle(cursor, type, url, article)
    except:
        print "Error"
        values = [type, url, '', '', '', '', '', '', None, "Error"]
        cursor.execute(
            "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content, status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            values)


def scanning4type(cursor, type, url, subfix):
    # Read the number of pages, then open them one by one.
    print u"开始扫描文章"
    request = requests.get(url)
    dom = soupparser.fromstring(request.content)
    body = dom[0]
    # the largest page number sits in the pager at the bottom
    pagebars = body.xpath("//div[@class='pagebar']/a")
    if len(pagebars) > 2:
        maxPageSize = int(pagebars[len(pagebars) - 2].text) + 1
        # open the pages one by one
        for i in range(1, maxPageSize):
            scanningUrl = "".join([url, subfix]).replace(currentPageId, str(i))
            scanning4typeArticle(cursor, type, scanningUrl)
    print u"扫描文章完成"


def main():
    # main entry point: open the database
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from qs_article")
    # scan a few types: the first few categories on the navigation bar
    scanning4type(cursor, "8HR", "http://www.qiushibaike.com/8hr", "".join(["/page/", "currentPageId", "?s=4602020"]))
    #scanning4type(cursor, "HOT", "http://www.qiushibaike.com/hot", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "IMGRANK", "http://www.qiushibaike.com/imgrank", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "LATE", "http://www.qiushibaike.com/late", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4typeArticle(cursor, type, "http://www.qiushibaike.com/late/page/346?s=4602057")
    # close the database
    cursor.close()
    conn.close()


# run the main program
main()
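The example above downloads each thumbnail to a temporary file, reads it back and deletes it before inserting the bytes as a blob. Since requests already exposes the raw body, the temporary file can probably be skipped; a sketch with a hypothetical getImageBytes helper:

Python code:

import MySQLdb
import requests

def getImageBytes(imgUrl):
    # Hypothetical in-memory variant of getImageFile: fetch the image and return its raw bytes.
    r = requests.get(imgUrl)
    return r.content

#usage inside scannintArticle:
#    values.append(MySQLdb.Binary(getImageBytes(imgUrl)))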
Example: crawling images from some Sina blogs, with rate limiting added:
Python code:

#!/usr/bin/python
# encoding=gbk
#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time

maxPage = 100  # maximum number of index pages to scan

requests.adapters.DEFAULT_RETRIES = 5

# rate-limiting settings
DEFAULT_OPEN_PAGE_FREQUENCY = 1    # seconds to wait between index pages
DEFAULT_OPEN_IMAGE_FREQUENCY = 3   # seconds to wait between batches of image downloads
DEFAULT_IMAGE_COUNT = 0            # image counter
DEFAULT_IMAGE_SIZE = 20            # after every DEFAULT_IMAGE_SIZE images, sleep DEFAULT_OPEN_IMAGE_FREQUENCY seconds


def saveImage(title, imageSrc):
    # save an image
    if title == None:
        title = u"无题"
    print u"标题:%s 图片:%s" % (title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(pageUrl):
    # Open a link from the index page and walk the image page behind it.
    global DEFAULT_IMAGE_COUNT
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    print u"遍历图片页面, 标题:%s, 地址: %s " % (titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # there seem to be two addresses: prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # only continue if an image address is present
        if imageSrc != None:
            saveImage(titleStr, imageSrc)
            # rate limiting
            DEFAULT_IMAGE_COUNT = DEFAULT_IMAGE_COUNT + 1
            if DEFAULT_IMAGE_COUNT % DEFAULT_IMAGE_SIZE == 0:
                print u"图片计数:%s, 休息 %s 秒钟后继续\n" % (DEFAULT_IMAGE_COUNT, DEFAULT_OPEN_IMAGE_FREQUENCY)
                time.sleep(DEFAULT_OPEN_IMAGE_FREQUENCY)


def listPicIndex():
    # Walk the index pages by page number.
    for i in range(1, maxPage + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        request = requests.get(url)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            # find every image link on this page; the page behind the link is the one that shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # walk the image page
                listPicPage(link[0].get("href"))
            print u"---------------------------------------------完成一个图片链接, 页数:", i
            # rate limiting
            # time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------完成页数", maxPage, ":", i


def main():
    listPicIndex()
    #listPicPage("http://qing.blog.sina.com.cn/tj/a1509eee330044am.html")


if __name__ == "__main__":
    main()
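The rate limiting above is a global counter plus time.sleep. The same idea can be packaged as a small reusable helper; a sketch with a hypothetical RateLimiter class that none of the scripts here actually use:

Python code:

import time

class RateLimiter(object):
    # Hypothetical helper: sleep for pause_seconds after every batch_size calls to tick().
    def __init__(self, batch_size=20, pause_seconds=3):
        self.batch_size = batch_size
        self.pause_seconds = pause_seconds
        self.count = 0

    def tick(self):
        self.count = self.count + 1
        if self.count % self.batch_size == 0:
            print u"processed %s items, sleeping %s seconds" % (self.count, self.pause_seconds)
            time.sleep(self.pause_seconds)

#usage: limiter = RateLimiter(); call limiter.tick() after each saveImage()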
The example above, rewritten to use multiple threads:
Python code:

#!/usr/bin/python
# encoding=gbk
#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time
import threading

MAX_PAGE = 100   # maximum number of index pages to scan
MAX_ERROR = 10   # maximum number of errors a thread may hit; below this limit it keeps retrying
PAGE_SIZE = 5    # number of pages per thread

DEFAULT_OPEN_PAGE_FREQUENCY = 2             # seconds to sleep after finishing a page
DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY = 5  # seconds to wait before retrying after an error

requests.adapters.DEFAULT_RETRIES = 5


def saveImage(thName, title, imageSrc, currentPath):
    # save an image
    if title == None:
        title = u"无题"
    print u"线程名称:%s, 页码:%s, 标题:%s 图片:%s" % (thName, currentPath, title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(thName, pageUrl, currentPath):
    # Open a link from the index page and walk the image page behind it.
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    #print u"\n\n页码:%s, 遍历图片页面, 标题:%s, 地址: %s " % (currentPath, titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # there seem to be two addresses: prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # only continue if an image address is present
        if imageSrc != None:
            saveImage(thName, titleStr, imageSrc, currentPath)


def listPicIndex(thName, startPath, endPath):
    # Walk the index pages in the range assigned to this thread.
    for i in range(startPath, endPath + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        print url
        request = requests.get(url)
        json_obj = json.loads(request.content)
        error_count = 0
        for item in json_obj["data"]["list"]:
            # find every image link on this page; the page behind the link is the one that shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # walk the image page
                try:
                    listPicPage(thName, link[0].get("href"), i)
                except:
                    if error_count < MAX_ERROR:
                        error_count = error_count + 1
                        # on error, wait a moment and retry once
                        print u"---------------------------------------------休眠%s秒钟后重试, 页数:%s" % (
                            DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY, i)
                        time.sleep(DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY)
                        listPicPage(thName, link[0].get("href"), i)
                    else:
                        print u"出错超过预设次数,退出爬虫。"
            #print u"---------------------------------------------完成一个图片链接, 页数:", i
        # rate limiting
        time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------完成页数", MAX_PAGE, ":", i
    return True


class MyThread(threading.Thread):
    def __init__(self, name, startPath, endPage):
        threading.Thread.__init__(self)
        self.name = name
        self.is_stop = False
        self.startPage = startPath
        self.endPage = endPage

    def run(self):
        # stop the thread once its page range has been walked
        while not self.is_stop:
            self.is_stop = listPicIndex(self.name, self.startPage, self.endPage)

    def stop(self):
        # set the stop flag manually
        self.is_stop = True


if __name__ == "__main__":
    # create one thread per page range
    # note: consecutive ranges share their boundary page (e.g. 1-6 and 6-11)
    count = 1
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = i + PAGE_SIZE
        if endPath > MAX_PAGE:
            endPath = MAX_PAGE
        print startPath, ",", endPath
        t = MyThread("Thread " + str(count), startPath, endPath)
        count = count + 1
        t.start()
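The main block above starts the workers and returns without join(), so the program only keeps running because the threads are non-daemon. If you need to know when every page range is done (for example to print the total elapsed time), it is enough to collect the thread objects and join them; a sketch assuming the same MyThread class, MAX_PAGE and PAGE_SIZE as above:

Python code:

if __name__ == "__main__":
    threads = []
    count = 1
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = min(i + PAGE_SIZE, MAX_PAGE)
        t = MyThread("Thread " + str(count), startPath, endPath)
        count = count + 1
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # block until this thread's page range has been crawled
    print u"all threads finished"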