自己没有测试成功,但可以找到入口。 原文参考: http://blog.csdn.net/warison200…
不静之心

分类:专业技术
CSS3 radial-gradient径向渐变语法及辅助理解案例10则
参考: http://www.zhangxinxu.com/wordpress/2017/11/css3-ra…
mysql的一些日志设定
更详细的信息参考: http://pangge.blog.51cto.com/6013757/1319304 …
python爬虫,下载图片
|
#! /usr/bin/python3
'''
Icon crawler; the approach follows the page structure of the target site.

Purpose: download every icon reachable from baseUrl.
1. Open baseUrl/categories and collect all categories.
2. Open each category page (e.g. baseUrl/category/1) and collect the icon
   sets listed there, following the pagination.
3. Open each icon set (e.g. baseUrl/icon/101003), download the cover image,
   then download every image listed further down the page.

Run with:
/usr/bin/python3 /home/pandy/workspace/idea/PythonAppTest/com/pandy/Spider/IconPngDownload.py
'''
'''
CREATE TABLE DOWNLOAD_FROM_ICONPNG_CAT
(
    CAT_ID bigint NOT NULL AUTO_INCREMENT,
    CAT_NAME VARCHAR(100),
    CREATE_DATE DATETIME,
    PRIMARY KEY (CAT_ID),
    INDEX INDEX_2 (CAT_NAME),
    CONSTRAINT INDEX_1 UNIQUE (CAT_NAME)
)
ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

CREATE TABLE DOWNLOAD_FROM_ICONPNG_SET
(
    SET_ID bigint NOT NULL AUTO_INCREMENT,
    CAT_ID bigint,
    SET_NAME VARCHAR(100),
    SET_URL VARCHAR(100),
    PNG_TOTAL bigint,
    CREATE_DATE DATETIME,
    PRIMARY KEY (SET_ID),
    CONSTRAINT FK_DOWNLOAD_FROM_ICONPNG_CAT FOREIGN KEY (CAT_ID) REFERENCES `DOWNLOAD_FROM_ICONPNG_CAT` (`CAT_ID`) ,
    CONSTRAINT INDEX_1 UNIQUE (SET_URL),
    INDEX INDEX_2 (CAT_ID, SET_NAME)
)
ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

CREATE TABLE DOWNLOAD_FROM_ICONPNG_PNG
(
    PNG_ID bigint NOT NULL AUTO_INCREMENT,
    SET_ID bigint,
    PNG_NAME VARCHAR(100),
    PNG_URL VARCHAR(100),
    CREATE_DATE DATETIME,
    PRIMARY KEY (PNG_ID),
    CONSTRAINT FK_DOWNLOAD_FROM_ICONPNG_SET FOREIGN KEY (SET_ID) REFERENCES `DOWNLOAD_FROM_ICONPNG_SET` (`SET_ID`) ,
    CONSTRAINT INDEX_1 UNIQUE (SET_ID,PNG_URL),
    INDEX INDEX_2 (PNG_NAME)
)
ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
'''
import logging
import os
import random
import socket
import threading
import time
import urllib
from urllib.request import urlopen, urlretrieve, Request

from lxml import etree
import pymysql

exitFlag = 0
baseUrl = 'http://www.xxx.com'
baseDir = "/mnt/E/3_技术资料/我的图标"
logging.basicConfig(level=logging.INFO)


def _openDb():
    """Open a new MySQL connection with the settings this script uses.

    NOTE(review): host and credentials are hard-coded; consider moving them
    to configuration or environment variables.
    """
    return pymysql.connect(host='192.168.0.222', port=3306, user='pandy',
                           passwd='pandy', db='test', charset='utf8')


def _now():
    """Return the local time formatted for a DATETIME column."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def insertSetInfoMYSQL(db, cursor, catId, setName, setUrl, ct):
    """Insert one icon-set row and return its generated SET_ID.

    Values are passed as query parameters rather than concatenated into the
    statement, so scraped names containing quotes cannot break or inject SQL.
    The caller is responsible for committing.
    """
    sql = ("INSERT INTO DOWNLOAD_FROM_ICONPNG_SET "
           "(CAT_ID, SET_NAME, SET_URL, PNG_TOTAL, CREATE_DATE) "
           "VALUES (%s, %s, %s, %s, %s)")
    cursor.execute(sql, (catId, setName, setUrl, ct, _now()))
    return db.insert_id()


def isExistSetInfoMYSQL(cursor, catId, setUrl):
    """Return True when a set with this category id and URL is already stored."""
    sql = ("SELECT 1 FROM DOWNLOAD_FROM_ICONPNG_SET "
           "WHERE CAT_ID=%s AND SET_URL=%s LIMIT 1")
    cursor.execute(sql, (catId, setUrl))
    return cursor.fetchone() is not None


def savePngInfoMYSQL(db, cursor, setId, pngName, pngUrl):
    """Insert one PNG row unless an identical (set, name, url) row exists.

    ``db`` is unused but kept so existing callers keep working. The caller is
    responsible for committing.
    """
    cursor.execute(
        "SELECT PNG_ID FROM DOWNLOAD_FROM_ICONPNG_PNG "
        "WHERE SET_ID=%s AND PNG_NAME=%s AND PNG_URL=%s LIMIT 1",
        (setId, pngName, pngUrl))
    if cursor.fetchone() is not None:
        return
    cursor.execute(
        "INSERT INTO DOWNLOAD_FROM_ICONPNG_PNG "
        "(SET_ID, PNG_NAME, PNG_URL, CREATE_DATE) VALUES (%s, %s, %s, %s)",
        (setId, pngName, pngUrl, _now()))


def saveCatInfoMYSQL(db, cursor, catName):
    """Return the CAT_ID for ``catName``, inserting the category if missing.

    The caller is responsible for committing.
    """
    cursor.execute(
        "SELECT CAT_ID, CAT_NAME FROM DOWNLOAD_FROM_ICONPNG_CAT "
        "WHERE CAT_NAME=%s LIMIT 1",
        (catName,))
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    cursor.execute(
        "INSERT INTO DOWNLOAD_FROM_ICONPNG_CAT (CAT_NAME, CREATE_DATE) "
        "VALUES (%s, %s)",
        (catName, _now()))
    return db.insert_id()


def mkdirs(path):
    """Create ``path`` and any missing parents; do nothing if it exists."""
    # exist_ok avoids the check-then-create race between worker threads.
    os.makedirs(path, exist_ok=True)


def getHttpReq(url):
    """Build a request for ``url`` that carries a desktop-browser User-Agent."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    return urllib.request.Request(url=url, headers=headers)


def getHtml(url):
    """Download ``url`` and return the body decoded as GBK."""
    # The response is closed as soon as the body has been read.
    with urllib.request.urlopen(getHttpReq(url)) as response:
        body = response.read()
    return body.decode("gbk")


def parseXPath(url):
    """Download ``url`` and return it parsed as an lxml HTML tree."""
    return etree.HTML(getHtml(url))


def getCategories(url):
    """Return ``[name, href]`` pairs for every category link on the page."""
    logging.info("=======================获得所有分类============================")
    doc = parseXPath(url)
    links = doc.xpath('/html//td[@class="contentt"]//a[@class="category-text"]')
    return [[link.text, link.attrib["href"]] for link in links]


def getAllCatPageUrls(threadID, title, href):
    """Return the relative URLs of every result page of one category.

    The last page number is read from the link just before the final link in
    the paging block. When that link is missing or not numeric, a single page
    is assumed (assumption: categories without paging fit on one page).
    """
    logging.info("threadID:["+str(threadID)+"]=======================获得一个分类下面的所有分页地址:" + title)
    doc = parseXPath(baseUrl + href)
    pagingLinks = doc.xpath('/html//div[@class="paging"]//a[last()-1]')
    lastPage = 1
    if pagingLinks:
        label = (pagingLinks[0].text or "").strip()
        if label.isdigit():
            lastPage = int(label)
    return [href + "/" + str(page) for page in range(1, lastPage + 1)]


def getAllCatPageIconSet(threadID, url):
    """Return ``[name, href]`` pairs for every icon set on one category page."""
    logging.info("threadID:["+str(threadID)+"]=======================处理每个页面下的集合============================")
    doc = parseXPath(baseUrl + url)
    links = doc.xpath('/html//div[@class="div_listtitle"]/a[@class="listtitle"]')
    return [[link.text, link.attrib["href"]] for link in links]


def getSetPngList(threadID, text, url):
    """Return ``[title, href]`` pairs for every PNG of one icon set.

    ``url`` is the set's landing page. The cover image comes first, followed
    by the icons listed further down the page.
    """
    logging.info("threadID:["+str(threadID)+"]=======================处理分类下面的分页的集合信息: " + text + ", " + url)
    doc = parseXPath(baseUrl + url)
    pngList = []

    # Cover image: the title sits on the <img>, the download link on the <a>.
    coverImgs = doc.xpath('/html//td[@class="icondetailstdicon"]/div/img')
    coverLinks = doc.xpath('/html//td[@class="icondetailstdicon"]/a[@class="blocklink"]')
    if coverImgs and coverLinks:
        pngList.append([coverImgs[0].attrib["title"], coverLinks[0].attrib["href"]])
    else:
        logging.warning("threadID:[%s] cover image not found on %s", threadID, url)

    # Icon list: titles and download links are collected separately and paired
    # by position; the first download link of each icon is taken as the PNG.
    titles = [img.attrib["title"] for img in doc.xpath(
        '/html//div[@id="iconlist"]//div[@class="icon"]//div[@class="iconlinks"]/a/img')]
    hrefs = [a.attrib["href"] for a in doc.xpath(
        '/html//div[@id="iconlist"]//div[@class="icon"]//div[@class="iconlinks"]//div[@class="downloadlinks"]/a[1]')]
    if len(titles) != len(hrefs):
        logging.warning("threadID:[%s] %d titles but %d links on %s",
                        threadID, len(titles), len(hrefs), url)
    # TODO: decide how to handle download links that are not PNG files.
    # zip() pairs only complete entries instead of raising IndexError.
    pngList.extend([title, href] for title, href in zip(titles, hrefs))
    return pngList


def downloadPng(imgDir, text, url):
    """Download one PNG into ``imgDir`` as ``<text>.png``.

    Returns False on success and True on failure (callers treat True as
    "timed out / failed").
    """
    try:
        urlretrieve(baseUrl + url, os.path.join(imgDir, '%s.png' % text))
    except Exception:
        # Best-effort download: report and let the caller skip this set.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        logging.error("不能下载图片:" + baseUrl + url + " " + text)
        return True
    logging.debug("成功下载图片:" + baseUrl + url + " " + text)
    return False


def processCategory(threadID, cat, catId):
    """Download every icon set of one category and record it in MySQL.

    ``cat`` is a ``[name, href]`` pair and ``catId`` its database id. Each
    worker uses its own connection, which is always closed on exit. One
    transaction is committed per fully downloaded set; a set with a failed
    download is rolled back so that it is retried on the next run.
    """
    db = _openDb()
    try:
        logging.info("===>threadID:["+str(threadID)+"]处理一个分类:" + cat[0] + " => " + str(threadID))
        mkdirs(baseDir + "/" + cat[0])
        # All result pages of this category.
        pageUrlList = getAllCatPageUrls(threadID, cat[0], cat[1])
        cursor = db.cursor()
        try:
            for pageUrl in pageUrlList:
                logging.info("threadID:["+str(threadID)+"]===========================================分页地址:" +pageUrl)
                time.sleep(random.randrange(5))  # random pause between requests
                # Walk every icon set listed on this page.
                for pngSet in getAllCatPageIconSet(threadID, pageUrl):
                    imgDir = baseDir + "/" + cat[0] + "/" + pngSet[0]
                    mkdirs(imgDir)
                    # Skip sets already recorded; the landing-page URL is the key.
                    if isExistSetInfoMYSQL(cursor, catId, pngSet[1]):
                        logging.warning("threadID:["+str(threadID)+"]♣♣♣♣♣--------------> 已经下载过:" + pngSet[0])
                        continue
                    downUrlList = getSetPngList(threadID, pngSet[0], pngSet[1])
                    time.sleep(random.randrange(5))  # random pause between requests
                    # any() stops at the first failed download. An empty list
                    # counts as a failure so the set is not marked as done.
                    failed = not downUrlList or any(
                        downloadPng(imgDir, name, link) for name, link in downUrlList)
                    if failed:
                        logging.error("threadID:["+str(threadID)+"]下载图片超时:" + cat[0] + "[" + pngSet[0] + "]")
                        db.rollback()
                        continue
                    # Record the finished set so it is not downloaded again.
                    setId = insertSetInfoMYSQL(db, cursor, catId, pngSet[0], pngSet[1], len(downUrlList))
                    logging.info("threadID:["+str(threadID)+"]♥♥♥♥♥===========================================下载完成一个集合:" + pngSet[0])
                    for name, link in downUrlList:
                        savePngInfoMYSQL(db, cursor, setId, name, link)
                    # One commit per set.
                    db.commit()
                    time.sleep(5)
        finally:
            cursor.close()
    finally:
        db.close()


class myThread(threading.Thread):
    """Worker thread that downloads a single category."""

    def __init__(self, threadID, name, cat, catId):
        """Store the thread id, display name, ``[name, href]`` pair and DB id."""
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.cat = cat
        self.catId = catId

    def run(self):
        """Process every page of the category assigned to this thread."""
        logging.info("------------------------------------------->开始下载线程:" + self.name)
        processCategory(self.threadID, self.cat, self.catId)
        logging.info("-------退出下载线程:" + self.name)


def main():
    """Register every category and start one download thread per category."""
    db = _openDb()
    try:
        cursor = db.cursor()
        try:
            socket.setdefaulttimeout(20)
            # Collect every image category of the site.
            categories = getCategories(baseUrl + '/categories')
            # Create the base download directory.
            mkdirs(baseDir)
            for count, cat in enumerate(categories, start=1):
                catId = saveCatInfoMYSQL(db, cursor, cat[0])
                db.commit()
                print("catId=" + str(catId))
                # One worker per category. The threads are not joined here;
                # being non-daemon, they keep the process alive until done.
                myThread(count, cat[0], cat, catId).start()
        finally:
            cursor.close()
    finally:
        # The workers open their own connections, so this one can go now.
        db.close()


if __name__ == '__main__':
    main()
python的xpath
Python爬虫利器三之Xpath语法与lxml库的用法 http://cuiqingcai.com/2621…
python的Log使用
转: http://www.jianshu.com/p/feb86c06c4f4 默认情况下,logging模…
python表单提交
参考: Python3 模拟登录知乎(requests) http://www.jianshu.com/p/7…
python执行shell命令需要sudo密码
os.system方式
1 2 3 |
import shlex
import subprocess

sudoPassword = 'pandy'
command = '/opt/lampp/lampp stopmysql'

# Feed the password to `sudo -S` on stdin instead of interpolating it into a
# shell command line: this keeps it out of the process arguments and avoids
# shell injection through `command`.
completed = subprocess.run(
    ['sudo', '-S'] + shlex.split(command),
    input=(sudoPassword + '\n').encode(),
)
# Exit code of the command (0 means success). The result is no longer bound
# to the name `str`, which shadowed the builtin.
exitStatus = completed.returncode
&n…
python3 连接 mysql 遇到乱码问题解决方案
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
#! /usr/bin/python3
import pymysql

print("开始数据库操作")

# Connecting without an explicit charset:
# db = pymysql.connect("192.168.0.222","pandy","******","pandy_psi")
# Connect with charset utf8 to avoid garbled (mojibake) text.
db = pymysql.connect(host='192.168.0.222', port=3306, user='pandy',
                     passwd='******', db='pandy_psi', charset='utf8')
try:
    cursor = db.cursor()
    try:
        sql = "SELECT CLIENT_ID,NAME ,ADDR ,STAFF_NAME ,TEL ,MOBILE FROM PSI_CLIENT"
        cursor.execute(sql)
        # Iterate the fetched rows directly; avoids shadowing the builtin `list`.
        for row in cursor.fetchall():
            print("ID=%s, NAME=%s, ADDR=%s" % (row[0], row[1], row[2]))
    finally:
        cursor.close()
finally:
    # Always release the connection, even when the query fails.
    db.close()
postgresql 获取表属性结构信息
postgresql 获取表属性结构信息 https://my.oschina.net/longtian/bl…
shiro整合ehcache AND SessionManager
shiro整合ehcache AND SessionManager https://my.oschina.ne…
MySQL 获取两个日期之间的日期列表
https://my.oschina.net/mobinchao/blog/1540015 [crayon-6…
H5移动端知识点总结
H5移动端知识点总结 https://my.oschina.net/u/3689829/blog/154063…
MySQL 对于千万级的大表要怎么优化
MySQL 对于千万级的大表要怎么优化 https://www.zhihu.com/question/1971…
使用jquery获取css的top和left属性
使用jquery获取css的top和left属性 http://www.jquerycn.cn/a_85 经常…
Spring Mybatis 二级缓存整合
springMVC+mybatis+ehcache详细配置 https://my.oschina.net/u/…
定时任务Quartz的调用例子
TriggerListener和JobListener https://nkcoder.github.io/2…
SpringMVC整合fastdfs-client-java实现web文件上传下载
SpringMVC整合fastdfs-client-java实现web文件上传下载 https://my.os…
Servlet 3 新特性详解
Servlet 3.0 新特性详解 https://www.ibm.com/developerworks/cn…
Apache POI .xlsx 下拉框实现
Apache POI .xlsx 下拉框实现 https://my.oschina.net/u/3035165…
Apache反向代理对WebSocket“不支持”的解决方案
Apache反向代理对WebSocket“不支持”的解决方案 https://segmentfault.com…
Spring跟Drools整合
基于Spring + Drools6.4规则引擎代码实例 https://my.oschina.net/xie…
Spring从txt文件注入到bean属性
1 |
<!-- Declares a bean with id "ProductUtils", backed by the class cn.com.voge.utils.ProductUtils. -->
<bean id="ProductUtils" class="cn.com.voge.utils.ProductUtils"/> |
[crayon-6815f79d2ae150…
Jsoup解析Html中文文档
Jsoup解析Html中文文档 http://www.cnblogs.com/jycboy/p/jsoupdo…
Web Notifications API
Web Notifications API http://javascript.ruanyifeng.com/…
Spring 3.x使用websocket, Tomcat7, 原生Websocket
Chrome,Tomcat7、Chrome下web socket的一个例子 http://tyrion.ite…
汉字验证码
汉字验证码 https://my.oschina.net/u/2742034/blog/1499202 [cr…
如何写一个jQuery插件:扩展jQuery的对象
教程1 https://gist.github.com/quexer/3619237 创建一个自定义 jQue…
SpringMVC拦截器
SpringMVC拦截器 https://my.oschina.net/dreambreeze/blog/14…
JavaScript实现十种经典排序算法
http://www.htmleaf.com/ziliaoku/qianduanjiaocheng/20170…
Spring AOP @Aspect 基本用法
https://my.oschina.net/u/3218855/blog/1439368 Spring使用的…
JAVA实现SFTP上传,下载,删除
JAVA实现SFTP上传,下载,删除 https://my.oschina.net/u/2447394/blo…
Servlet3.0 新特性——HttpServletRequest 对文件上传的支持
Servlet3.0 新特性——HttpServletRequest 对文件上传的支持 https://my….
Java中的锁-Lock接口解析
http://blog.csdn.net/canot/article/details/52050633 提到J…
一步步完成FastDFS + Spring MVC上传下载整合示例
一步步完成FastDFS + Spring MVC上传下载整合示例 https://my.oschina.ne…