JupyterLab/.ipynb_checkpoints/微信公众号文章爬取-checkpoint.ipynb
2024-01-18 21:11:41 +08:00

439 lines
24 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# -*- coding:utf-8 -*-\n",
"import requests\n",
"import time\n",
"import csv\n",
"import json\n",
"from lxml import etree "
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"url=\"https://sd.dzwww.com/sdnews/default.htm\"\n",
"header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"res=requests.get(url,headers=header)\n",
"html=etree.HTML(res.text.encode(\"latin1\").decode(\"gbk\"))"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n",
"['盘点2019展望2020|打造样板,乡村振兴全面起势', '攻坚2020|攻坚关键在项目,山东各地抓项目如何“心中有数”“手中有招”?', '将原油码头“搬”到炼化企业门口 连接黄渤海“大油龙”全线贯通', '57个果品品牌获评齐鲁放心果品 葡萄拔得头筹苹果紧随其后', '“45度让路法”风靡网络 路遇特种车辆,如何礼让', '济南网约车细则再征民意2月1日起施行车辆需安装定位', '对有偿补课零容忍济南通报12起典型问题', '双招双引,看如何招得好留得住?大众日报整版经验来了', '国务院批复 青岛再添国家级功能区 直接选中这里', '济南:将项目资金分配权交给部门 把项目确定权交给基层', '不到一月,三次半路停摆! 唐骏箱货新车“带病”出厂就“添堵”', '问政山东丨携号转网靓号不给转还乱加业务包?省通信局长:有人为设置障碍', '问政山东丨“非实名”电话卡仍在售 店主打包票“只要不违法就不会被封”', '春运期间加开7列途经潍坊站的列车', '济南网约车门槛“有降有升”,高德等聚合平台也要承担审核责任', '青岛纺织服装行业如何互联网升级 专业人士给方案', '2020春运大幕将拉开 青岛铁路预计发送240万人次', '“天价高速费”最新进展山西方面退还1100元并道歉', '2020年将有这些新大学在山东落地 青岛成为主要建校地?', '自贸区青岛片区首个平台型经济落户西海岸新区', '世界级创新中心落户高新区 五年将聚集300家企业!', '定了!济南泉城路真正变身步行街,非驻区车辆将绕行外围', '自来水管道破损致济南经十路“成河”,他们泡水里抢修一夜没合眼', '“开门红”!2020年青岛首批专项债券80亿元顺利发行', '烟台芝罘大海阳过街天桥主体将于春节前完工预计3月底前投用', '推动市区一体融合发展烟台全长24公里的跨区输水管网正在启动', '济南一特色小镇项目规划有进展,将形成大规模办公区', '注意啦济南二环北路药山西路交叉口施工K127路临时调线', '14岁女孩长期排便困难就要带着“粪兜”过活八旬老教授say no', '山东首个!济南西站开通滴滴网约车专属通道', '有一种冷叫妈妈觉得你冷!孩子冬天穿太多反而更易感冒', '2020年山东全面放宽落户条件 这3个市要建成垃圾分类处理系统', '女子投诉快递员不送件竟遭登门暴打!韵达快递:已开除涉事员工', '问政山东丨两年拉黑了1000多个号码 骚扰电话“阴魂不散”该咋办?', '“雪窝”烟台清雪防滑有高招!巧用环保型融雪剂保通畅', '预防流感守护健康 医生提醒:幼儿、老人、孕妇需格外注意', '东营:雪夜一男子落水 4名民警“搭人梯”救人', '退役军人事务工作表彰奖励办法出炉 有这3个常设项目', '寻找青年“政治佳”第四季获奖作品抢先看!', '一年出警586万人次2019年济南实现街面“两抢”315天零发案', '山财原党委副书记车滨已出任德州学院党委书记', '济莱高铁全线完成招标 钢城区段已具备施工条件', '村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……', '手机用户呈低龄化,低劣内容借移动社交隐蔽传播', '@2020年高考生山东新高考及模拟工作权威解析来了', '淄博:隆冬季节 消防员趴在地上救助被压车底老人', '早上入院治疗晚上就能出院 山东首家省级日间病房开始运行', '山东:推动开发区体制机制改革 打造高质量发展示范区', '村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……', '车祸受害者治病花了30多万陷困境加害人又无力赔偿怎么办']\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641108.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641186.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641111.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641113.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641112.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641320.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641277.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641268.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641259.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4641330.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688310.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688303.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688271.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688253.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688251.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688236.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688209.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688208.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688207.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688204.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688194.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688183.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688180.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688179.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688175.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688166.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688161.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688158.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688057.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688042.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688014.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4688003.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687984.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687966.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687911.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687892.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687870.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687834.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687823.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687816.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687803.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687802.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687773.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687736.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687735.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687732.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687654.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687605.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687592.htm\n",
"https://sd.dzwww.com/sdnews/202001/d20200109_4687587.htm\n"
]
}
],
"source": [
"#获取新闻标题\n",
"news_titles=[]\n",
"news_a=html.xpath('//div[3]/ul/li/h3/a')\n",
"print(len(news_a))\n",
"for a in news_a:\n",
" news_titles.append(a.xpath('string(.)').strip())\n",
"print(news_titles)\n",
"#获取新闻链接\n",
"news_links=html.xpath('//div[3]/ul/li/h3/a/@href')\n",
"for link in news_links:\n",
" print(link)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"盘点2019展望2020|打造样板,乡村振兴全面起势\n",
"2020 01/09 05:45\n",
"来源:大众日报\n",
"作者:毛鑫鑫 张 鹏 于新悦\n",
"攻坚2020|攻坚关键在项目,山东各地抓项目如何“心中有数”“手中有招”?\n",
"2020 01/09 06:17\n",
"来源:大众日报客户端\n",
"作者:禹亚宁 孙秀岭\n",
"将原油码头“搬”到炼化企业门口 连接黄渤海“大油龙”全线贯通\n",
"2020 01/09 05:47\n",
"来源:大众日报\n",
"作者:李媛 王晶\n",
"57个果品品牌获评齐鲁放心果品 葡萄拔得头筹苹果紧随其后\n",
"2020 01/09 05:50\n",
"来源:大众日报\n",
"作者:\n",
"“45度让路法”风靡网络 路遇特种车辆,如何礼让\n",
"2020 01/09 05:50\n",
"来源:大众日报\n",
"作者:马海燕 张依盟\n",
"济南网约车细则再征民意2月1日起施行车辆需安装定位\n",
"2020 01/09 06:43\n",
"来源:济南时报\n",
"作者:\n",
"对有偿补课零容忍济南通报12起典型问题\n",
"2020 01/09 06:38\n",
"来源:济南日报\n",
"作者:\n",
"双招双引,看如何招得好留得住?大众日报整版经验来了\n",
"2020 01/09 06:37\n",
"来源:大众日报\n",
"作者:张晓帆 吕光社 杨国胜 张 蓓\n",
"国务院批复 青岛再添国家级功能区 直接选中这里\n",
"2020 01/09 06:36\n",
"来源:青岛新闻网\n",
"作者:\n",
"济南:将项目资金分配权交给部门 把项目确定权交给基层\n",
"2020 01/09 06:45\n",
"来源:济南日报\n",
"作者:\n",
"不到一月,三次半路停摆! 唐骏箱货新车“带病”出厂就“添堵”\n",
"2020 01/09 21:30\n",
"来源:大众网·海报新闻\n",
"作者:石磊\n",
"问政山东丨携号转网靓号不给转还乱加业务包?省通信局长:有人为设置障碍\n",
"2020 01/09 21:29\n",
"来源:大众网·海报新闻\n",
"作者:辛振东\n",
"问政山东丨“非实名”电话卡仍在售 店主打包票“只要不违法就不会被封”\n",
"2020 01/09 21:17\n",
"来源:大众网·海报新闻\n",
"作者:张稳\n",
"春运期间加开7列途经潍坊站的列车\n",
"2020 01/09 21:11\n",
"来源:大众日报客户端\n",
"作者:大众日报客户端记者 张蓓\n",
"济南网约车门槛“有降有升”,高德等聚合平台也要承担审核责任\n",
"2020 01/09 21:10\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:刘飞跃\n",
"青岛纺织服装行业如何互联网升级 专业人士给方案\n",
"2020 01/09 21:06\n",
"来源:青岛新闻网\n",
"作者:刘倩倩\n",
"2020春运大幕将拉开 青岛铁路预计发送240万人次\n",
"2020 01/09 21:02\n",
"来源:青岛新闻网\n",
"作者:江东旭\n",
"“天价高速费”最新进展山西方面退还1100元并道歉\n",
"2020 01/09 21:01\n",
"来源:大众网·海报新闻\n",
"作者:解强民\n",
"2020年将有这些新大学在山东落地 青岛成为主要建校地?\n",
"2020 01/09 21:01\n",
"来源:青岛新闻网\n",
"作者:\n",
"自贸区青岛片区首个平台型经济落户西海岸新区\n",
"2020 01/09 21:01\n",
"来源:青岛新闻网\n",
"作者:陈志伟\n",
"世界级创新中心落户高新区 五年将聚集300家企业!\n",
"2020 01/09 21:00\n",
"来源:青岛新闻网\n",
"作者:任俊峰\n",
"定了!济南泉城路真正变身步行街,非驻区车辆将绕行外围\n",
"2020 01/09 20:59\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:张泰来 杜亚慧\n",
"自来水管道破损致济南经十路“成河”,他们泡水里抢修一夜没合眼\n",
"2020 01/09 20:59\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:程凌润 孙雨杨\n",
"“开门红”!2020年青岛首批专项债券80亿元顺利发行\n",
"2020 01/09 20:58\n",
"来源:青岛新闻网\n",
"作者:刘倩倩\n",
"烟台芝罘大海阳过街天桥主体将于春节前完工预计3月底前投用\n",
"2020 01/09 20:58\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:张菁\n",
"推动市区一体融合发展烟台全长24公里的跨区输水管网正在启动\n",
"2020 01/09 20:57\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:曲彦霖\n",
"济南一特色小镇项目规划有进展,将形成大规模办公区\n",
"2020 01/09 20:57\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:吕琳\n",
"注意啦济南二环北路药山西路交叉口施工K127路临时调线\n",
"2020 01/09 20:57\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:刘飞跃\n",
"14岁女孩长期排便困难就要带着“粪兜”过活八旬老教授say no\n",
"2020 01/09 20:38\n",
"来源:大众网·海报新闻\n",
"作者:董昊骞\n",
"山东首个!济南西站开通滴滴网约车专属通道\n",
"2020 01/09 20:35\n",
"来源:大众网·海报新闻\n",
"作者:宋冰\n",
"有一种冷叫妈妈觉得你冷!孩子冬天穿太多反而更易感冒\n",
"2020 01/09 20:31\n",
"来源:大众网·海报新闻\n",
"作者:董昊骞\n",
"2020年山东全面放宽落户条件 这3个市要建成垃圾分类处理系统\n",
"2020 01/09 20:28\n",
"来源:大众网·海报新闻\n",
"作者:张稳 吕乐\n",
"女子投诉快递员不送件竟遭登门暴打!韵达快递:已开除涉事员工\n",
"2020 01/09 20:20\n",
"来源:大众网·海报新闻\n",
"作者:\n",
"问政山东丨两年拉黑了1000多个号码 骚扰电话“阴魂不散”该咋办?\n",
"2020 01/09 20:13\n",
"来源:大众网·海报新闻\n",
"作者:张稳\n",
"“雪窝”烟台清雪防滑有高招!巧用环保型融雪剂保通畅\n",
"2020 01/09 20:05\n",
"来源:大众网·海报新闻\n",
"作者:杨瑞远\n",
"预防流感守护健康 医生提醒:幼儿、老人、孕妇需格外注意\n",
"2020 01/09 20:04\n",
"来源:大众网·海报新闻\n",
"作者:吴宝杰\n",
"东营:雪夜一男子落水 4名民警“搭人梯”救人\n",
"2020 01/09 20:02\n",
"来源:大众网·海报新闻\n",
"作者:陈丽伟\n",
"退役军人事务工作表彰奖励办法出炉 有这3个常设项目\n",
"2020 01/09 20:00\n",
"来源:大众网·海报新闻\n",
"作者:秦文\n",
"寻找青年“政治佳”第四季获奖作品抢先看!\n",
"2020 01/09 19:57\n",
"来源:大众网·海报新闻\n",
"作者:\n",
"一年出警586万人次2019年济南实现街面“两抢”315天零发案\n",
"2020 01/09 19:55\n",
"来源:大众网·海报新闻\n",
"作者:张珈玮\n",
"山财原党委副书记车滨已出任德州学院党委书记\n",
"2020 01/09 19:51\n",
"来源:大众网·海报新闻\n",
"作者:孙杰\n",
"济莱高铁全线完成招标 钢城区段已具备施工条件\n",
"2020 01/09 19:50\n",
"来源:大众网·海报新闻\n",
"作者:解强民\n",
"村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……\n",
"2020 01/09 19:48\n",
"来源:大众网·海报新闻\n",
"作者:解强民\n",
"手机用户呈低龄化,低劣内容借移动社交隐蔽传播\n",
"2020 01/09 19:46\n",
"来源:新时报\n",
"作者:\n",
"@2020年高考生山东新高考及模拟工作权威解析来了\n",
"2020 01/09 19:46\n",
"来源:大众日报客户端\n",
"作者:大众日报客户端记者 王桂利\n",
"淄博:隆冬季节 消防员趴在地上救助被压车底老人\n",
"2020 01/09 19:45\n",
"来源:大众日报客户端\n",
"作者:大众日报记者 张依盟\n",
"早上入院治疗晚上就能出院 山东首家省级日间病房开始运行\n",
"2020 01/09 19:43\n",
"来源:大众网·海报新闻\n",
"作者:董昊骞\n",
"山东:推动开发区体制机制改革 打造高质量发展示范区\n",
"2020 01/09 19:35\n",
"来源:齐鲁网\n",
"作者:\n",
"村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……\n",
"2020 01/09 19:33\n",
"来源:大众网·海报新闻\n",
"作者:\n",
"车祸受害者治病花了30多万陷困境加害人又无力赔偿怎么办\n",
"2020 01/09 19:30\n",
"来源:齐鲁晚报·齐鲁壹点\n",
"作者:杨璐\n"
]
}
],
"source": [
"#爬取文章内容\n",
"import re\n",
"content=[]\n",
"for news_link in news_links:\n",
" c_res = requests.get(news_link,headers=header)\n",
" c_html = etree.HTML(c_res.text.encode(\"latin1\").decode(\"gbk\"))\n",
" s = etree.tostring(c_html).decode()\n",
" #print(s)\n",
" p = c_html.xpath('//div[@id=\"news-body\"]')\n",
" c_body = p[0].xpath('string(.)').strip()\n",
" c_body = re.sub('<[^<]+?>', '', c_body).replace('\\n', '').strip()\n",
" c_body = re.sub(r'<!--(.*?)-->', '', c_body)\n",
" #print(c_body)\n",
" p = c_html.xpath('//div[@id=\"news-head\"]/h2')\n",
" c_title = p[0].xpath('string(.)').strip()\n",
" print(c_title)\n",
" p = c_html.xpath('//div[@id=\"news-side\"]/div[1]')\n",
" c_time = p[0].xpath('string(.)').strip()\n",
" c_time = c_time.replace('\\n',' ')\n",
" c_time = c_time.replace('\\t','')\n",
" print(c_time)\n",
" p = c_html.xpath('//div[@id=\"news-side\"]/div[2]')\n",
" c_laiyuan = p[0].xpath('string(.)').strip()\n",
" c_laiyuan = c_laiyuan.replace('\\n','')\n",
" c_laiyuan = c_laiyuan.replace('\\t','')\n",
" print(c_laiyuan)\n",
" p = c_html.xpath('//div[@id=\"news-side\"]/div[3]')\n",
" c_zuozhe = p[0].xpath('string(.)').strip()\n",
" c_zuozhe = c_zuozhe.replace('\\n','')\n",
" c_zuozhe = c_zuozhe.replace('\\t','')\n",
" print(c_zuozhe)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'data/weixin_zhonyin.csv'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-34-c48b4fb69043>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m#实现多页爬取和存入csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"data/weixin_zhonyin.csv\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'a+'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnewline\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcsv_writer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwriter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtitles\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/weixin_zhonyin.csv'"
]
}
],
"source": [
"#实现多页爬取和存入csv\n",
"with open(\"data/weixin_zhonyin.csv\",'a+',newline='',encoding='utf-8') as file:\n",
" csv_writer=csv.writer(file)\n",
" for i in range(2):\n",
" titles=[]\n",
" contents=[]\n",
" url=\"http://weixin.sogou.com/weixin?oq=&query={}&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1509458456759&page={}&ie=utf8&p=40040108&dp=1&w=01015002&dr=1\".format(\"中印对峙\",i+1)\n",
" res=requests.get(url,headers=header)\n",
" html=etree.HTML(res.text)\n",
" news_a=html.xpath('//div[@class=\"news-box\"]/ul[@class=\"news-list\"]/li/div[@class=\"txt-box\"]/h3/a')\n",
" for a in news_a:\n",
" titles.append(a.xpath('string(.)').strip())\n",
"# print(titles)\n",
" links=html.xpath('//div[@class=\"news-box\"]/ul[@class=\"news-list\"]/li/div[@class=\"txt-box\"]/h3/a/@href')\n",
" for link in links:\n",
" c_res=requests.get(link,headers=header)\n",
" c_html=etree.HTML(c_res.text)\n",
" p=c_html.xpath('//div[@id=\"js_content\"]/p/text()')\n",
" if p:\n",
" contents.append('。'.join(p))\n",
" else:\n",
" p=c_html.xpath('//div[@id=\"js_content\"]/p/span/text()')\n",
" contents.append('。'.join(p))\n",
" for data in zip(titles,links,contents):\n",
" print(data)\n",
" csv_writer.writerow(list(data))\n",
" time.sleep(2)\n",
" print(\"第{}页爬取完毕\".format(i+1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}