#! /usr/bin/python3
'''
First look at the rough structure of the site's pages, then derive the approach.

What this script does: download every icon from baseUrl.
1. Open baseUrl/categories and collect all categories.
2. Open baseUrl/category/1 to list the icon sets under that category; mind the pagination.
3. Open a set such as baseUrl/icon/101003, download its cover image, then walk the
   thumbnails below, open each page, and download the images.

/usr/bin/python3 /home/pandy/workspace/idea/PythonAppTest/com/pandy/Spider/IconPngDownload.py
'''

'''
CREATE TABLE DOWNLOAD_FROM_ICONPNG_CAT
(
    CAT_ID      bigint NOT NULL AUTO_INCREMENT,
    CAT_NAME    VARCHAR(100),
    CREATE_DATE DATETIME,
    PRIMARY KEY (CAT_ID),
    INDEX INDEX_2 (CAT_NAME),
    CONSTRAINT INDEX_1 UNIQUE (CAT_NAME)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

CREATE TABLE DOWNLOAD_FROM_ICONPNG_SET
(
    SET_ID      bigint NOT NULL AUTO_INCREMENT,
    CAT_ID      bigint,
    SET_NAME    VARCHAR(100),
    SET_URL     VARCHAR(100),
    PNG_TOTAL   bigint,
    CREATE_DATE DATETIME,
    PRIMARY KEY (SET_ID),
    CONSTRAINT FK_DOWNLOAD_FROM_ICONPNG_CAT FOREIGN KEY (CAT_ID) REFERENCES `DOWNLOAD_FROM_ICONPNG_CAT` (`CAT_ID`),
    CONSTRAINT INDEX_1 UNIQUE (SET_URL),
    INDEX INDEX_2 (CAT_ID, SET_NAME)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

CREATE TABLE DOWNLOAD_FROM_ICONPNG_PNG
(
    PNG_ID      bigint NOT NULL AUTO_INCREMENT,
    SET_ID      bigint,
    PNG_NAME    VARCHAR(100),
    PNG_URL     VARCHAR(100),
    CREATE_DATE DATETIME,
    PRIMARY KEY (PNG_ID),
    CONSTRAINT FK_DOWNLOAD_FROM_ICONPNG_SET FOREIGN KEY (SET_ID) REFERENCES `DOWNLOAD_FROM_ICONPNG_SET` (`SET_ID`),
    CONSTRAINT INDEX_1 UNIQUE (SET_ID, PNG_URL),
    INDEX INDEX_2 (PNG_NAME)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
'''

import os
import time
import urllib
from urllib.request import urlopen, urlretrieve, Request

from lxml import etree
import pymysql
import socket
import logging
import threading
import random

baseUrl = 'http://www.xxx.com'
baseDir = "/mnt/E/3_技术资料/我的图标"

logging.basicConfig(level=logging.INFO)


# Insert one icon-set row and return its auto-generated SET_ID.
# Parameterized queries are used throughout so that names containing
# quotes can neither break nor inject into the SQL.
def insertSetInfoMYSQL(db, cursor, catId, setName, setUrl, ct):
    sql = "INSERT INTO DOWNLOAD_FROM_ICONPNG_SET (CAT_ID, SET_NAME, SET_URL, PNG_TOTAL, CREATE_DATE)" \
          " VALUES (%s, %s, %s, %s, %s)"
    cursor.execute(sql, (catId, setName, setUrl, ct,
                         time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return db.insert_id()


# Has this icon set already been recorded (i.e. fully downloaded)?
def isExistSetInfoMYSQL(cursor, catId, setUrl):
    sql = "SELECT 1 FROM DOWNLOAD_FROM_ICONPNG_SET WHERE CAT_ID=%s AND SET_URL=%s"
    cursor.execute(sql, (catId, setUrl))
    return cursor.fetchone() is not None


# Record one PNG of a set, skipping rows that already exist.
def savePngInfoMYSQL(db, cursor, setId, pngName, pngUrl):
    sql = "SELECT PNG_ID FROM DOWNLOAD_FROM_ICONPNG_PNG WHERE SET_ID=%s AND PNG_NAME=%s AND PNG_URL=%s"
    cursor.execute(sql, (setId, pngName, pngUrl))
    if cursor.fetchone() is not None:
        return
    sql = "INSERT INTO DOWNLOAD_FROM_ICONPNG_PNG (SET_ID, PNG_NAME, PNG_URL, CREATE_DATE)" \
          " VALUES (%s, %s, %s, %s)"
    cursor.execute(sql, (setId, pngName, pngUrl,
                         time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))


# Look up a category by name, inserting it first if necessary; return its CAT_ID.
def saveCatInfoMYSQL(db, cursor, catName):
    sql = "SELECT CAT_ID, CAT_NAME FROM DOWNLOAD_FROM_ICONPNG_CAT WHERE CAT_NAME=%s"
    cursor.execute(sql, (catName,))
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    sql = "INSERT INTO DOWNLOAD_FROM_ICONPNG_CAT (CAT_NAME, CREATE_DATE) VALUES (%s, %s)"
    cursor.execute(sql, (catName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return db.insert_id()


def mkdirs(path):
    if not os.path.exists(path):
        os.makedirs(path)
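
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, never called by the script): how the
# database helpers above fit together. The connection parameters are the same
# placeholders used in main() below; the category/set/png values are made-up
# examples.
#
#   db = pymysql.connect(host='192.168.0.222', port=3306, user='pandy',
#                        passwd='pandy', db='test', charset='utf8')
#   cursor = db.cursor()
#   catId = saveCatInfoMYSQL(db, cursor, 'example-category')  # insert or look up
#   setId = insertSetInfoMYSQL(db, cursor, catId, 'example-set', '/icon/101003', 2)
#   savePngInfoMYSQL(db, cursor, setId, 'example-png', '/example.png')
#   db.commit()
# ---------------------------------------------------------------------------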
def getHttpReq(url):
    # Pretend to be a desktop browser; some sites refuse urllib's default user agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    return req


# Fetch the HTML of a page (the site serves GBK-encoded pages).
def getHtml(url):
    html = urllib.request.urlopen(getHttpReq(url)).read()
    return html.decode("gbk")


def parseXPath(url):
    html = getHtml(url)
    html_data = etree.HTML(html)
    return html_data


# Return all categories as [name, href] pairs.
def getCategories(url):
    logging.info("======================= fetching all categories ============================")
    categories = []
    xpath = parseXPath(url)
    result = xpath.xpath('/html//td[@class="contentt"]//a[@class="category-text"]')
    for r in result:
        categories.append([r.text, r.attrib["href"]])
    return categories


# Return the URL of every result page under one category.
def getAllCatPageUrls(threadID, title, href):
    logging.info("threadID:[" + str(threadID) + "]======================= fetching all page URLs of category: " + title)
    pageUrls = []
    url = baseUrl + href
    xpath = parseXPath(url)
    # Read the page range from the pager, then enumerate the pages.
    sIndex = 1
    # The link just before the "next page" button carries the last page number.
    result = xpath.xpath('/html//div[@class="paging"]//a[last()-1]')
    eIndex = int(result[0].text)
    for i in range(sIndex, eIndex + 1):
        pageUrls.append(href + "/" + str(i))
    return pageUrls


# Return the icon sets found on one category page as [name, href] pairs.
def getAllCatPageIconSet(threadID, url):
    logging.info("threadID:[" + str(threadID) + "]======================= processing the sets on one page ============================")
    url = baseUrl + url
    sets = []
    xpath = parseXPath(url)
    result = xpath.xpath('/html//div[@class="div_listtitle"]/a[@class="listtitle"]')
    for r in result:
        sets.append([r.text, r.attrib["href"]])
    return sets


# Collect every PNG of a set as [title, href] pairs; the entry URL is the
# page of the first image.
def getSetPngList(threadID, text, url):
    logging.info("threadID:[" + str(threadID) + "]======================= processing set: " + text + ", " + url)
    url = baseUrl + url
    pngs = []
    xpath = parseXPath(url)
    # Parse the cover image: its title ...
    result = xpath.xpath('/html//td[@class="icondetailstdicon"]/div/img')
    txt = result[0].attrib["title"]
    # ... and its download link.
    result = xpath.xpath('/html//td[@class="icondetailstdicon"]/a[@class="blocklink"]')
    href = result[0].attrib["href"]
    pngs.append([txt, href])
    # Parse the thumbnail list: titles first, then the matching download links.
    titleList = []
    imgs = xpath.xpath('/html//div[@id="iconlist"]//div[@class="icon"]//div[@class="iconlinks"]/a/img')
    for img in imgs:
        titleList.append(img.attrib["title"])
    hrefList = []
    hrefs = xpath.xpath(
        '/html//div[@id="iconlist"]//div[@class="icon"]//div[@class="iconlinks"]//div[@class="downloadlinks"]/a[1]')
    for h in hrefs:
        hrefList.append(h.attrib["href"])
    # TODO: what if a file is not in PNG format? (see the sketch after this function)
    for i in range(len(hrefList)):
        pngs.append([titleList[i], hrefList[i]])
    return pngs
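
# One possible answer to the TODO in getSetPngList (a sketch, not wired into
# downloadPng below): derive the file extension from the download URL instead
# of hard-coding ".png". Falling back to ".png" when the URL carries no
# extension is an assumption.
def guessExtension(url):
    from urllib.parse import urlparse
    # os.path.splitext works on URL paths too; urlparse strips any query string.
    ext = os.path.splitext(urlparse(url).path)[1]
    return ext if ext else ".png"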
# Download one PNG into imgDir; return True on failure so the caller can
# abort the whole set and retry it on a later run.
def downloadPng(imgDir, text, url):
    try:
        urlretrieve(baseUrl + url, imgDir + '/' + '%s.png' % text)
        logging.debug("downloaded: " + baseUrl + url + " " + text)
        return False
    except Exception:
        logging.error("could not download: " + baseUrl + url + " " + text)
        return True


# Worker body: walk every page of one category and download its sets.
def processCategory(threadID, cat, catId):
    # One database connection per thread.
    db = pymysql.connect(host='192.168.0.222', port=3306, user='pandy', passwd='pandy', db='test', charset='utf8')
    logging.info("===>threadID:[" + str(threadID) + "] processing category: " + cat[0])
    mkdirs(baseDir + "/" + cat[0])
    # All page URLs of this category.
    pageUrlList = getAllCatPageUrls(threadID, cat[0], cat[1])
    cursor = db.cursor()
    for pageUrl in pageUrlList:
        logging.info("threadID:[" + str(threadID) + "]=========================================== page: " + pageUrl)
        time.sleep(random.choice(range(5)))  # random pause
        # All icon sets on this page.
        setList = getAllCatPageIconSet(threadID, pageUrl)
        for pngSet in setList:
            imgDir = baseDir + "/" + cat[0] + "/" + pngSet[0]
            mkdirs(imgDir)
            # Skip sets that were already downloaded; the set's entry URL
            # recorded in the database is the marker.
            if isExistSetInfoMYSQL(cursor, catId, pngSet[1]):
                logging.warning("threadID:[" + str(threadID) + "]♣♣♣♣♣--------------> already downloaded: " + pngSet[0])
                continue
            # Every image of this set.
            downUrlList = getSetPngList(threadID, pngSet[0], pngSet[1])
            isTimeout = False
            time.sleep(random.choice(range(5)))  # random pause
            for durl in downUrlList:
                isTimeout = downloadPng(imgDir, durl[0], durl[1])
                if isTimeout:
                    break
            if not isTimeout:
                # The whole set is on disk: record it so the next run skips it.
                setId = insertSetInfoMYSQL(db, cursor, catId, pngSet[0], pngSet[1], len(downUrlList))
                logging.info("threadID:[" + str(threadID) + "]♥♥♥♥♥=========================================== finished set: " + pngSet[0])
                for durl in downUrlList:
                    savePngInfoMYSQL(db, cursor, setId, durl[0], durl[1])
                # One commit per set.
                db.commit()
                time.sleep(5)
            else:
                logging.error("threadID:[" + str(threadID) + "] download timed out: " + cat[0] + "[" + pngSet[0] + "]")
                db.rollback()
                # break  # test a single set
        # break  # test a single page
    cursor.close()
    db.close()


class myThread(threading.Thread):
    # One thread per category.
    def __init__(self, threadID, name, cat, catId):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.cat = cat
        self.catId = catId

    def run(self):
        logging.info("-------------------------------------------> starting download thread: " + self.name)
        # Process the pages of this category.
        processCategory(self.threadID, self.cat, self.catId)
        logging.info("------- exiting download thread: " + self.name)


def main():
    db = pymysql.connect(host='192.168.0.222', port=3306, user='pandy', passwd='pandy', db='test', charset='utf8')
    cursor = db.cursor()
    socket.setdefaulttimeout(20)
    url = baseUrl + '/categories'
    # All icon categories of the site.
    categories = getCategories(url)
    # Create the base directory.
    mkdirs(baseDir)
    count = 0
    for cat in categories:
        catId = saveCatInfoMYSQL(db, cursor, cat[0])
        db.commit()
        print("catId=" + str(catId))
        count = count + 1
        # Start one worker thread per category.
        thread = myThread(count, cat[0], cat, catId)
        thread.start()
        # thread.join()  # would effectively wait for this thread before starting the next
        # break  # test a single category
    cursor.close()
    db.close()


if __name__ == '__main__':
    main()
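
# ---------------------------------------------------------------------------
# Note (a sketch, not part of the original flow): main() returns as soon as
# every category thread has been started. To make the process block until all
# workers finish, collect the threads and join them at the end, e.g.:
#
#   threads = []
#   for cat in categories:
#       ...
#       thread = myThread(count, cat[0], cat, catId)
#       thread.start()
#       threads.append(thread)
#   for t in threads:
#       t.join()
# ---------------------------------------------------------------------------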