本文共 4867 字,大约阅读时间需要 16 分钟。
近期在学习爬虫,利用Requests模块获取页面,BeautifulSoup来获取需要的内容,最后利用xlsxwriter模块将内容保存至excel,在此记录一下,后续可举一反三,利用其抓取其他内容并持久化存储到文件内,或数据库等。
编写了两个模块,geturl3和getexcel3,最后在main内调用
geturl3.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import requests
from bs4 import BeautifulSoup


class get_urldic:
    """Build 51CTO blog-search URLs, download the result pages and
    extract {blog_title: blog_url} pairs from them."""

    def get_url(self):
        """Prompt for a search keyword and page count; return (url_list, keyword).

        Exits the process when the page count is not a valid integer.
        """
        urlList = []
        first_url = 'http://blog.51cto.com/search/result?q='
        after_url = '&type=&page='
        try:
            search = input("Please input search name:")
            page = int(input("Please input page:"))
        except Exception as e:
            print('Input error:', e)
            exit()
        # One URL per result page, pages are 1-based.
        for num in range(1, page + 1):
            url = first_url + search + after_url + str(num)
            urlList.append(url)
        print("Please wait....")
        return urlList, search

    def get_html(self, urlList):
        """Fetch every URL and return a list of raw response bodies (bytes).

        Fix vs. original: requests.get() is given a timeout (a dead server
        would otherwise block forever) and HTTP/network errors are reported
        and skipped instead of crashing the whole crawl.
        """
        response_list = []
        for r_num in urlList:
            try:
                request = requests.get(r_num, timeout=10)
                request.raise_for_status()  # surface 4xx/5xx instead of parsing error pages
            except requests.RequestException as e:
                print('Fetch error for %s: %s' % (r_num, e))
                continue
            response_list.append(request.content)
        return response_list

    def get_soup(self, html_doc):
        """Parse each HTML page and return {blog_title: blog_url}.

        Search hits are anchors of the form <a class="m-1-4 fl" href=...>.
        """
        result = {}
        for g_num in html_doc:
            soup = BeautifulSoup(g_num, 'html.parser')
            context = soup.find_all('a', class_='m-1-4 fl')
            for i in context:
                title = i.get_text()
                result[title.strip()] = i['href']
        return result


if __name__ == '__main__':
    blog = get_urldic()
    urllist, search = blog.get_url()
    html_doc = blog.get_html(urllist)
    result = blog.get_soup(html_doc)
    for k, v in result.items():
        print('search blog_name is:%s,blog_url is:%s' % (k, v))
getexcel3.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import xlsxwriter


class create_excle:
    """Write blog-search results ({blog_name: blog_url}) into an .xlsx workbook."""

    def __init__(self):
        # Column headers written on row 1 of the sheet.
        self.tag_list = ["blog_name", "blog_url"]

    def create_workbook(self, search=" "):
        """Create '<search>.xlsx' with one worksheet named after the keyword.

        Returns (workbook, worksheet).
        """
        excle_name = search + '.xlsx'  # workbook file name
        workbook = xlsxwriter.Workbook(excle_name)
        worksheet_M = workbook.add_worksheet(search)
        print('create %s....' % excle_name)
        return workbook, worksheet_M

    def col_row(self, worksheet):
        """Set column widths and the title-row height."""
        worksheet.set_column('A:A', 12)
        worksheet.set_row(0, 17)
        worksheet.set_column('A:A', 58)  # final A width; overrides the 12 above
        worksheet.set_column('B:B', 58)

    def shell_format(self, workbook):
        """Return the (merge_format, name_format, normal_format) cell styles."""
        # merged title cell style
        merge_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#FAEBD7'
        })
        # column-header style
        name_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#E0FFFF'
        })
        # body-cell style
        normal_format = workbook.add_format({
            'align': 'center',
        })
        return merge_format, name_format, normal_format

    def write_title(self, worksheet, search, merge_format):
        """Write the merged A1:B1 title row."""
        title = search + "搜索结果"
        worksheet.merge_range('A1:B1', title, merge_format)
        print('write title success')

    def write_tag(self, worksheet, name_format):
        """Write the column headers on row index 1 (second sheet row)."""
        tag_row = 1
        tag_col = 0
        for num in self.tag_list:
            worksheet.write(tag_row, tag_col, num, name_format)
            tag_col += 1
        print('write tag success')

    def write_context(self, worksheet, con_dic, normal_format):
        """Write one sheet row per con_dic entry, starting at row index 2.

        Fix vs. original: the guard 'if row > len(con_dic): break' silently
        dropped the last entry (rows start at 2, so the counter exceeded
        len(con_dic) one iteration early; a 1-item dict wrote nothing).
        The loop needs no bound — iterate the dict itself.
        """
        row = 2
        for k, v in con_dic.items():
            worksheet.write(row, 0, k, normal_format)
            worksheet.write(row, 1, v, normal_format)
            row += 1
        print('write context success')

    def workbook_close(self, workbook):
        """Close the workbook; xlsxwriter only flushes data to disk on close."""
        workbook.close()


if __name__ == '__main__':
    print('This is create excel mode')
main.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import geturl3
import getexcel3


def get_dic():
    """Run the interactive search and return ({blog_name: blog_url}, keyword)."""
    crawler = geturl3.get_urldic()
    url_list, keyword = crawler.get_url()
    pages = crawler.get_html(url_list)
    return crawler.get_soup(pages), keyword


def write_excle(urldic, search):
    """Persist the search results to '<search>.xlsx': title, headers, rows."""
    excle = getexcel3.create_excle()
    workbook, worksheet = excle.create_workbook(search)
    excle.col_row(worksheet)
    merge_fmt, name_fmt, normal_fmt = excle.shell_format(workbook)
    excle.write_title(worksheet, search, merge_fmt)
    excle.write_tag(worksheet, name_fmt)
    excle.write_context(worksheet, urldic, normal_fmt)
    excle.workbook_close(workbook)


def main():
    """Crawl first, then dump the results into the Excel workbook."""
    result, keyword = get_dic()
    write_excle(result, keyword)


if __name__ == '__main__':
    main()
运行代码,填写搜索的关键字,及搜索多少页
查看会生成一个以搜索关键字命名的excel,打开写入的内容 利用其就可以搜索并保存自己需要的51CTO推荐博客,可以多搜索几个