本文共 8171 字,大约阅读时间需要 27 分钟。
5.1 媒体文件简述
网络上的资源很多:图片、视频、常规文件(rar/zip)等。由于网络爬取的数据量大,如果直接下载保存文件,相对于只保存对应的链接字符串有不少缺陷:1、下载会导致爬取速度变慢;2、消耗存储空间;3、还要额外实现文件下载的方法,较为繁琐。优点则是:可以防止因外链(盗链,即网页内部通往外部网页的链接)失效或内容变化导致的信息丢失。
简单的文件下载
# 1. Single-file download: fetch a page, locate the logo image, save it.
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch and parse the example page.
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html5lib")
print(bsObj)

# Chained lookup: the <a id="logo"> element wraps an <img>, and that
# element's "src" attribute holds the image URL.
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
print(imageLocation)

# Save the image into the working directory.
urlretrieve(imageLocation, "img01.jpg")
# Batch crawl: download every resource on the page that carries a src
# attribute, mirroring the site's path layout under a local folder.
import os
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Resources are saved under downloadDirectory using the path component of
# their URL relative to baseurl, so the local tree mirrors the site.
baseurl = "http://pythonscraping.com"
downloadDirectory = "download"

html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html5lib")
downloadList = bsObj.findAll(src=True)
print(len(downloadList))


def getAbsoluteURL(baseUrl, downloadUrl):
    """Resolve downloadUrl against baseUrl; return None for off-site URLs.

    Bug fix: the original ignored its baseUrl parameter (it read the module
    global instead) and returned None for every relative src, because only
    URLs beginning with "http://www." were handled.  urljoin resolves
    absolute, relative and protocol-relative forms uniformly.
    """
    url = urljoin(baseUrl, downloadUrl)
    # Normalize "http://www.host" to "http://host" so the containment
    # check against baseUrl matches both spellings.
    if url.startswith("http://www."):
        url = "http://" + url[11:]
    if baseUrl not in url:
        return None  # external resource -- skip it
    return url


def getDownloadPath(baseurl, fileurl, downloadDirectory):
    """Map an absolute file URL to a local path, creating directories."""
    # Drop any query string ("?...") so the path is a clean file name.
    if "?" in fileurl:
        fileurl = fileurl[:fileurl.index("?")]
    path = fileurl.replace(baseurl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path


for download in downloadList:
    fileurl = getAbsoluteURL(baseurl, download["src"])
    # getAbsoluteURL returns None for off-site links; guard against the
    # 'NoneType' object has no attribute 'replace' error.
    if fileurl is not None:
        if "?" in fileurl:
            fileurl = fileurl[:fileurl.index("?")]
        print(fileurl)
        urlretrieve(fileurl,
                    getDownloadPath(baseurl, fileurl, downloadDirectory))
5.2 把数据存储到csv中
这里的意思是将Html中的表格数据存储到csv表格中。1、找到一个table标签及其对应的id;2、找到所有的tr标签,即所有的行信息;3、遍历所有的行信息,找到每行对应的td或者th标签,写入csv中
# Example 1: write a simple CSV file (created automatically if missing).
import csv
import os

# Fix: guarantee the target directory exists, and use a with-block so the
# file is closed even if writing raises; newline="" prevents the extra
# blank rows the csv module otherwise produces on Windows.
os.makedirs("files", exist_ok=True)
with open("files/test.csv", "w+", newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number1', 'number2', 'number3'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))

# Example 2: scrape an HTML table and write the rows to CSV.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import codecs  # kept from the original source; not used below

html = urlopen("https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%94%9F%E4%BA%A7%E6%80%BB%E5%80%BC/31864?fromtitle=GDP&fromid=41201")
bsObj = BeautifulSoup(html, "html5lib")
# The second matching table on the page holds the GDP ranking.
table = bsObj.findAll("table", {"class": "table-view log-set-param"})[1]
rows = table.findAll("tr")

# Encoding note: Excel opens GB2312-encoded CSV correctly but not UTF-8,
# while databases store UTF-8 -- convert at the read/write boundary:
#   CSV -> DB: data.decode('GB2312').encode('utf-8')
#   DB -> CSV: data.decode('utf-8').encode('GB2312')
# Windows: GB2312; Linux: utf-8.
with open("files/gdpList.csv", 'wt', newline='', encoding="GB2312") as csvFile:
    writer = csv.writer(csvFile)
    # Opening the CSV in an editor shows comma-separated text; that is the
    # expected format, not corruption.
    for row in rows:
        csvRow = [cell.getText() for cell in row.findAll('td')]
        writer.writerow(csvRow)
实验结果:
5.3 python连接mysql
这里的连接原理和 Java 是一样的,但写法比 Java 简便很多。安装 pymysql 库之后,导入即可使用:import pymysql
'''
# 1. Minimal pymysql example (kept as reference, not executed):
import pymysql
conn = pymysql.Connect("localhost", "root", "root", "pythontes")
cur = conn.cursor()
cur.execute("select * from pages")
# fetchone(): fetch the next row; fetchall(): fetch all remaining rows
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()
# Output:
# ('a1', 'liuBei', '123')
# ('a2', 'guanYu', '123')
# ('a3', 'zhangSanFei', '123')
'''

# 2. Save the scraped GDP ranking table into MySQL.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import codecs  # kept from the original source; not used below
import pymysql

html = urlopen("https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%94%9F%E4%BA%A7%E6%80%BB%E5%80%BC/31864?fromtitle=GDP&fromid=41201")
bsObj = BeautifulSoup(html, "html5lib")
table = bsObj.findAll("table", {"class": "table-view log-set-param"})[1]
rows = table.findAll("tr")

# Fix: PyMySQL >= 1.0 no longer accepts positional connect() arguments, and
# passing charset up front avoids the Windows error
# "UnicodeEncodeError: 'latin-1' codec can't encode character"
# (see https://stackoverflow.com/questions/3942888/unicodeencodeerror-latin-1-codec-cant-encode-character)
conn = pymysql.connect(host="localhost", user="root", password="root",
                       database="pythontes", charset="utf8")
cur = conn.cursor()
try:
    isHeader = True
    for row in rows:
        # Skip the first (header/description) row of the table.
        if isHeader:
            isHeader = False
            continue
        csvRow = [cell.getText() for cell in row.findAll('td')]
        print(csvRow)
        # All scraped cells are strings; the driver does not coerce
        # placeholder types (e.g. %d with int(...) fails), so every column
        # is inserted as text.
        sql = "insert into gdpList values(%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, csvRow)
        conn.commit()  # commit per row, as in the original
finally:
    cur.close()
    conn.close()
数据库实现网络图的爬取(深度爬取)
简要谈谈数据库的优化:以下为爬取深度网络,并将网络链接对应信息保存;使用了两个数据库表
-- Schema for the link crawler: `pages` stores each distinct URL,
-- `links` stores one row per fromPageId -> toPageId edge.
-- Fix: each statement needs a ';' terminator to run as a script.
CREATE TABLE `pages` (
  `id` INT NOT NULL AUTO_INCREMENT,
  `url` VARCHAR(255) NOT NULL,
  `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
);

CREATE TABLE `links` (
  `id` INT NOT NULL AUTO_INCREMENT,
  `fromPageId` INT NOT NULL,
  `toPageId` INT NOT NULL,
  `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
);
"""
@author: zoutai
@file: linkToLink.py
@time: 2018/01/24
@description: crawl a bounded-depth link network and store every
page-to-page edge (as id pairs) in MySQL.
"""
import re
from urllib.request import urlopen

import pymysql
from bs4 import BeautifulSoup

# Fix: PyMySQL >= 1.0 dropped positional connect() arguments; keyword
# arguments (with charset set at connect time) are the supported form.
conn = pymysql.connect(host="localhost", user="root", password="root",
                       database="pythontes", charset="utf8")
cur = conn.cursor()

# URLs already visited in this run, to avoid re-crawling the same page.
pages = set()


def insertLink(fromPageId, toPageId):
    """Record the edge fromPageId -> toPageId, skipping duplicates."""
    cur.execute("select * from links where fromPageId = %s and toPageId = %s",
                (int(fromPageId), int(toPageId)))
    if cur.rowcount == 0:
        cur.execute("insert into links (fromPageId,toPageId) values (%s,%s)",
                    (int(fromPageId), int(toPageId)))
        conn.commit()


def insertPage(pageUrl):
    """Return the stored id for pageUrl, inserting a new row when needed."""
    # Fix: DB-API parameters must be a sequence -- "(pageUrl)" is just a
    # parenthesised string, so pass a real 1-tuple.
    cur.execute("select * from pages where url = %s", (pageUrl,))
    if cur.rowcount == 0:
        cur.execute("insert into pages (url) values (%s)", (pageUrl,))
        conn.commit()
        return cur.lastrowid
    return cur.fetchone()[0]


def getLinks(pageUrl, recursionLevel):
    """Depth-first crawl from pageUrl, recording edges, down to level 0."""
    global pages
    if recursionLevel < 0:
        return
    pageId = insertPage(pageUrl)
    html = urlopen("https://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html5lib")
    # Only internal article links: /wiki/... with no ':' (which would mark
    # File:, Talk: and other non-article namespaces).
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPage(link.attrs["href"]))
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel - 1)


getLinks("/wiki/Kevin_Bacon", 2)
5.4 发送Email数据,使用163的SMTP邮件服务器
“””
@author: zoutai @file: sendEmail.py @time: 2018/01/24 @description: 邮件发送 “”“from smtplib import SMTPfrom email.mime.text import MIMETextfrom email.header import Headerdef send_email(SMTP_host, from_addr, password, to_addrs, subject, content): email_client = SMTP(SMTP_host) email_client.login(from_addr, password) # create msg msg = MIMEText(content,'plain','utf-8') msg['Subject'] = Header(subject, 'utf-8')#subject msg['From'] = 'soundslow' msg['To'] = "xx@qq.com" email_client.sendmail(from_addr, to_addrs, msg.as_string()) email_client.quit()if __name__ == "__main__": send_email("smtp.163.com","xxx@163.com","xxx","530337704@qq.com","来自163的邮件","今晚的卫星有点亮")
转载地址:http://yqepi.baihongyu.com/