439 lines
24 KiB
Plaintext
439 lines
24 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# -*- coding:utf-8 -*-\n",
"import csv\n",
"import json\n",
"import os\n",
"import re\n",
"import time\n",
"\n",
"import requests\n",
"from lxml import etree"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Listing page that the following cells download and parse for headlines.\n",
"url = 'https://sd.dzwww.com/sdnews/default.htm'\n",
"\n",
"# Desktop-Chrome User-Agent, passed as `headers=` to every requests.get call below.\n",
"header = {\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',\n",
"}"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Download the listing page. The timeout keeps a stalled connection from hanging\n",
"# the kernel, and raise_for_status() stops here on an HTTP error instead of\n",
"# parsing an error page as if it were the listing.\n",
"res = requests.get(url, headers=header, timeout=10)\n",
"res.raise_for_status()\n",
"\n",
"# The page is served as GBK. Decode the raw bytes directly: going through\n",
"# res.text.encode('latin1') only works while requests guesses ISO-8859-1 and\n",
"# raises UnicodeEncodeError as soon as it picks any other charset.\n",
"html = etree.HTML(res.content.decode('gbk'))"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"50\n",
|
||
"['盘点2019展望2020|打造样板,乡村振兴全面起势', '攻坚2020|攻坚关键在项目,山东各地抓项目如何“心中有数”“手中有招”?', '将原油码头“搬”到炼化企业门口 连接黄渤海“大油龙”全线贯通', '57个果品品牌获评齐鲁放心果品 葡萄拔得头筹苹果紧随其后', '“45度让路法”风靡网络 路遇特种车辆,如何礼让', '济南网约车细则再征民意:2月1日起施行,车辆需安装定位', '对有偿补课零容忍!济南通报12起典型问题', '双招双引,看如何招得好留得住?大众日报整版经验来了', '国务院批复 青岛再添国家级功能区 直接选中这里', '济南:将项目资金分配权交给部门 把项目确定权交给基层', '不到一月,三次半路停摆! 唐骏箱货新车“带病”出厂就“添堵”', '问政山东丨携号转网靓号不给转还乱加业务包?省通信局长:有人为设置障碍', '问政山东丨“非实名”电话卡仍在售 店主打包票“只要不违法就不会被封”', '春运期间,加开7列途经潍坊站的列车', '济南网约车门槛“有降有升”,高德等聚合平台也要承担审核责任', '青岛纺织服装行业如何互联网升级 专业人士给方案', '2020春运大幕将拉开 青岛铁路预计发送240万人次', '“天价高速费”最新进展:山西方面退还1100元,并道歉!', '2020年将有这些新大学在山东落地 青岛成为主要建校地?', '自贸区青岛片区首个平台型经济落户西海岸新区', '世界级创新中心落户高新区 五年将聚集300家企业!', '定了!济南泉城路真正变身步行街,非驻区车辆将绕行外围', '自来水管道破损致济南经十路“成河”,他们泡水里抢修一夜没合眼', '“开门红”!2020年青岛首批专项债券80亿元顺利发行', '烟台芝罘大海阳过街天桥主体将于春节前完工,预计3月底前投用', '推动市区一体融合发展,烟台全长24公里的跨区输水管网正在启动', '济南一特色小镇项目规划有进展,将形成大规模办公区', '注意啦!济南二环北路药山西路交叉口施工,K127路临时调线', '14岁女孩长期排便困难就要带着“粪兜”过活?八旬老教授say no!', '山东首个!济南西站开通滴滴网约车专属通道', '有一种冷叫妈妈觉得你冷!孩子冬天穿太多反而更易感冒', '2020年山东全面放宽落户条件 这3个市要建成垃圾分类处理系统', '女子投诉快递员不送件竟遭登门暴打!韵达快递:已开除涉事员工', '问政山东丨两年拉黑了1000多个号码 骚扰电话“阴魂不散”该咋办?', '“雪窝”烟台清雪防滑有高招!巧用环保型融雪剂保通畅', '预防流感守护健康 医生提醒:幼儿、老人、孕妇需格外注意', '东营:雪夜一男子落水 4名民警“搭人梯”救人', '退役军人事务工作表彰奖励办法出炉 有这3个常设项目', '寻找青年“政治佳”第四季获奖作品抢先看!', '一年出警586万人次!2019年济南实现街面“两抢”315天零发案', '山财原党委副书记车滨已出任德州学院党委书记', '济莱高铁全线完成招标 钢城区段已具备施工条件', '村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……', '手机用户呈低龄化,低劣内容借移动社交隐蔽传播', '@2020年高考生!山东新高考及模拟工作权威解析来了', '淄博:隆冬季节 消防员趴在地上救助被压车底老人', '早上入院治疗晚上就能出院 山东首家省级日间病房开始运行', '山东:推动开发区体制机制改革 打造高质量发展示范区', '村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……', '车祸受害者治病花了30多万陷困境,加害人又无力赔偿怎么办?']\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641108.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641186.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641111.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641113.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641112.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641320.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641277.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641268.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641259.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4641330.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688310.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688303.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688271.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688253.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688251.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688236.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688209.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688208.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688207.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688204.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688194.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688183.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688180.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688179.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688175.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688166.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688161.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688158.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688057.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688042.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688014.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4688003.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687984.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687966.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687911.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687892.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687870.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687834.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687823.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687816.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687803.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687802.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687773.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687736.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687735.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687732.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687654.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687605.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687592.htm\n",
|
||
"https://sd.dzwww.com/sdnews/202001/d20200109_4687587.htm\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Headline anchors of the listing page.\n",
"news_a = html.xpath('//div[3]/ul/li/h3/a')\n",
"print(len(news_a))\n",
"\n",
"# News titles: string(.) yields each anchor's text content, strip() trims padding.\n",
"news_titles = [anchor.xpath('string(.)').strip() for anchor in news_a]\n",
"print(news_titles)\n",
"\n",
"# News links: the href attribute of the same anchors, printed one per line.\n",
"news_links = html.xpath('//div[3]/ul/li/h3/a/@href')\n",
"for link in news_links:\n",
"    print(link)"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 95,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"盘点2019展望2020|打造样板,乡村振兴全面起势\n",
|
||
"2020 01/09 05:45\n",
|
||
"来源:大众日报\n",
|
||
"作者:毛鑫鑫 张 鹏 于新悦\n",
|
||
"攻坚2020|攻坚关键在项目,山东各地抓项目如何“心中有数”“手中有招”?\n",
|
||
"2020 01/09 06:17\n",
|
||
"来源:大众日报客户端\n",
|
||
"作者:禹亚宁 孙秀岭\n",
|
||
"将原油码头“搬”到炼化企业门口 连接黄渤海“大油龙”全线贯通\n",
|
||
"2020 01/09 05:47\n",
|
||
"来源:大众日报\n",
|
||
"作者:李媛 王晶\n",
|
||
"57个果品品牌获评齐鲁放心果品 葡萄拔得头筹苹果紧随其后\n",
|
||
"2020 01/09 05:50\n",
|
||
"来源:大众日报\n",
|
||
"作者:\n",
|
||
"“45度让路法”风靡网络 路遇特种车辆,如何礼让\n",
|
||
"2020 01/09 05:50\n",
|
||
"来源:大众日报\n",
|
||
"作者:马海燕 张依盟\n",
|
||
"济南网约车细则再征民意:2月1日起施行,车辆需安装定位\n",
|
||
"2020 01/09 06:43\n",
|
||
"来源:济南时报\n",
|
||
"作者:\n",
|
||
"对有偿补课零容忍!济南通报12起典型问题\n",
|
||
"2020 01/09 06:38\n",
|
||
"来源:济南日报\n",
|
||
"作者:\n",
|
||
"双招双引,看如何招得好留得住?大众日报整版经验来了\n",
|
||
"2020 01/09 06:37\n",
|
||
"来源:大众日报\n",
|
||
"作者:张晓帆 吕光社 杨国胜 张 蓓\n",
|
||
"国务院批复 青岛再添国家级功能区 直接选中这里\n",
|
||
"2020 01/09 06:36\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:\n",
|
||
"济南:将项目资金分配权交给部门 把项目确定权交给基层\n",
|
||
"2020 01/09 06:45\n",
|
||
"来源:济南日报\n",
|
||
"作者:\n",
|
||
"不到一月,三次半路停摆! 唐骏箱货新车“带病”出厂就“添堵”\n",
|
||
"2020 01/09 21:30\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:石磊\n",
|
||
"问政山东丨携号转网靓号不给转还乱加业务包?省通信局长:有人为设置障碍\n",
|
||
"2020 01/09 21:29\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:辛振东\n",
|
||
"问政山东丨“非实名”电话卡仍在售 店主打包票“只要不违法就不会被封”\n",
|
||
"2020 01/09 21:17\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:张稳\n",
|
||
"春运期间,加开7列途经潍坊站的列车\n",
|
||
"2020 01/09 21:11\n",
|
||
"来源:大众日报客户端\n",
|
||
"作者:大众日报客户端记者 张蓓\n",
|
||
"济南网约车门槛“有降有升”,高德等聚合平台也要承担审核责任\n",
|
||
"2020 01/09 21:10\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:刘飞跃\n",
|
||
"青岛纺织服装行业如何互联网升级 专业人士给方案\n",
|
||
"2020 01/09 21:06\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:刘倩倩\n",
|
||
"2020春运大幕将拉开 青岛铁路预计发送240万人次\n",
|
||
"2020 01/09 21:02\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:江东旭\n",
|
||
"“天价高速费”最新进展:山西方面退还1100元,并道歉!\n",
|
||
"2020 01/09 21:01\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:解强民\n",
|
||
"2020年将有这些新大学在山东落地 青岛成为主要建校地?\n",
|
||
"2020 01/09 21:01\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:\n",
|
||
"自贸区青岛片区首个平台型经济落户西海岸新区\n",
|
||
"2020 01/09 21:01\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:陈志伟\n",
|
||
"世界级创新中心落户高新区 五年将聚集300家企业!\n",
|
||
"2020 01/09 21:00\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:任俊峰\n",
|
||
"定了!济南泉城路真正变身步行街,非驻区车辆将绕行外围\n",
|
||
"2020 01/09 20:59\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:张泰来 杜亚慧\n",
|
||
"自来水管道破损致济南经十路“成河”,他们泡水里抢修一夜没合眼\n",
|
||
"2020 01/09 20:59\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:程凌润 孙雨杨\n",
|
||
"“开门红”!2020年青岛首批专项债券80亿元顺利发行\n",
|
||
"2020 01/09 20:58\n",
|
||
"来源:青岛新闻网\n",
|
||
"作者:刘倩倩\n",
|
||
"烟台芝罘大海阳过街天桥主体将于春节前完工,预计3月底前投用\n",
|
||
"2020 01/09 20:58\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:张菁\n",
|
||
"推动市区一体融合发展,烟台全长24公里的跨区输水管网正在启动\n",
|
||
"2020 01/09 20:57\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:曲彦霖\n",
|
||
"济南一特色小镇项目规划有进展,将形成大规模办公区\n",
|
||
"2020 01/09 20:57\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:吕琳\n",
|
||
"注意啦!济南二环北路药山西路交叉口施工,K127路临时调线\n",
|
||
"2020 01/09 20:57\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:刘飞跃\n",
|
||
"14岁女孩长期排便困难就要带着“粪兜”过活?八旬老教授say no!\n",
|
||
"2020 01/09 20:38\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:董昊骞\n",
|
||
"山东首个!济南西站开通滴滴网约车专属通道\n",
|
||
"2020 01/09 20:35\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:宋冰\n",
|
||
"有一种冷叫妈妈觉得你冷!孩子冬天穿太多反而更易感冒\n",
|
||
"2020 01/09 20:31\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:董昊骞\n",
|
||
"2020年山东全面放宽落户条件 这3个市要建成垃圾分类处理系统\n",
|
||
"2020 01/09 20:28\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:张稳 吕乐\n",
|
||
"女子投诉快递员不送件竟遭登门暴打!韵达快递:已开除涉事员工\n",
|
||
"2020 01/09 20:20\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:\n",
|
||
"问政山东丨两年拉黑了1000多个号码 骚扰电话“阴魂不散”该咋办?\n",
|
||
"2020 01/09 20:13\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:张稳\n",
|
||
"“雪窝”烟台清雪防滑有高招!巧用环保型融雪剂保通畅\n",
|
||
"2020 01/09 20:05\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:杨瑞远\n",
|
||
"预防流感守护健康 医生提醒:幼儿、老人、孕妇需格外注意\n",
|
||
"2020 01/09 20:04\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:吴宝杰\n",
|
||
"东营:雪夜一男子落水 4名民警“搭人梯”救人\n",
|
||
"2020 01/09 20:02\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:陈丽伟\n",
|
||
"退役军人事务工作表彰奖励办法出炉 有这3个常设项目\n",
|
||
"2020 01/09 20:00\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:秦文\n",
|
||
"寻找青年“政治佳”第四季获奖作品抢先看!\n",
|
||
"2020 01/09 19:57\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:\n",
|
||
"一年出警586万人次!2019年济南实现街面“两抢”315天零发案\n",
|
||
"2020 01/09 19:55\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:张珈玮\n",
|
||
"山财原党委副书记车滨已出任德州学院党委书记\n",
|
||
"2020 01/09 19:51\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:孙杰\n",
|
||
"济莱高铁全线完成招标 钢城区段已具备施工条件\n",
|
||
"2020 01/09 19:50\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:解强民\n",
|
||
"村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……\n",
|
||
"2020 01/09 19:48\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:解强民\n",
|
||
"手机用户呈低龄化,低劣内容借移动社交隐蔽传播\n",
|
||
"2020 01/09 19:46\n",
|
||
"来源:新时报\n",
|
||
"作者:\n",
|
||
"@2020年高考生!山东新高考及模拟工作权威解析来了\n",
|
||
"2020 01/09 19:46\n",
|
||
"来源:大众日报客户端\n",
|
||
"作者:大众日报客户端记者 王桂利\n",
|
||
"淄博:隆冬季节 消防员趴在地上救助被压车底老人\n",
|
||
"2020 01/09 19:45\n",
|
||
"来源:大众日报客户端\n",
|
||
"作者:大众日报记者 张依盟\n",
|
||
"早上入院治疗晚上就能出院 山东首家省级日间病房开始运行\n",
|
||
"2020 01/09 19:43\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:董昊骞\n",
|
||
"山东:推动开发区体制机制改革 打造高质量发展示范区\n",
|
||
"2020 01/09 19:35\n",
|
||
"来源:齐鲁网\n",
|
||
"作者:\n",
|
||
"村民称“俺村进狼了偷鸡吃” 民警全副武装赶到结果……\n",
|
||
"2020 01/09 19:33\n",
|
||
"来源:大众网·海报新闻\n",
|
||
"作者:\n",
|
||
"车祸受害者治病花了30多万陷困境,加害人又无力赔偿怎么办?\n",
|
||
"2020 01/09 19:30\n",
|
||
"来源:齐鲁晚报·齐鲁壹点\n",
|
||
"作者:杨璐\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Crawl each article linked from the listing page; one dict per article is\n",
"# appended to `content` (the list used to be created but never filled).\n",
"def _node_text(tree, path):\n",
"    \"\"\"Return the stripped string value of the first node matching path.\n",
"\n",
"    Returns '' when the XPath matches nothing, so an article page that lacks\n",
"    one of the expected blocks does not abort the crawl with IndexError.\n",
"    \"\"\"\n",
"    nodes = tree.xpath(path)\n",
"    return nodes[0].xpath('string(.)').strip() if nodes else ''\n",
"\n",
"\n",
"content = []\n",
"for news_link in news_links:\n",
"    c_res = requests.get(news_link, headers=header, timeout=10)\n",
"    c_res.raise_for_status()\n",
"    # Article pages are GBK; decoding the raw bytes avoids the fragile\n",
"    # text -> latin1 -> gbk round-trip, which fails whenever requests does\n",
"    # not fall back to ISO-8859-1.\n",
"    c_html = etree.HTML(c_res.content.decode('gbk'))\n",
"\n",
"    # Article body: drop tag-like and comment-like leftovers and newlines.\n",
"    c_body = _node_text(c_html, '//div[@id=\"news-body\"]')\n",
"    c_body = re.sub('<[^<]+?>', '', c_body).replace('\\n', '').strip()\n",
"    c_body = re.sub(r'<!--(.*?)-->', '', c_body)\n",
"\n",
"    c_title = _node_text(c_html, '//div[@id=\"news-head\"]/h2')\n",
"    print(c_title)\n",
"\n",
"    # Publication time: newlines become spaces so date and time stay separated.\n",
"    c_time = _node_text(c_html, '//div[@id=\"news-side\"]/div[1]')\n",
"    c_time = c_time.replace('\\n', ' ').replace('\\t', '')\n",
"    print(c_time)\n",
"\n",
"    # Source line (second block of the side panel).\n",
"    c_laiyuan = _node_text(c_html, '//div[@id=\"news-side\"]/div[2]')\n",
"    c_laiyuan = c_laiyuan.replace('\\n', '').replace('\\t', '')\n",
"    print(c_laiyuan)\n",
"\n",
"    # Author line (third block of the side panel).\n",
"    c_zuozhe = _node_text(c_html, '//div[@id=\"news-side\"]/div[3]')\n",
"    c_zuozhe = c_zuozhe.replace('\\n', '').replace('\\t', '')\n",
"    print(c_zuozhe)\n",
"\n",
"    content.append({\n",
"        'link': news_link,\n",
"        'title': c_title,\n",
"        'time': c_time,\n",
"        'source': c_laiyuan,\n",
"        'author': c_zuozhe,\n",
"        'body': c_body,\n",
"    })"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "FileNotFoundError",
|
||
"evalue": "[Errno 2] No such file or directory: 'data/weixin_zhonyin.csv'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[1;32m<ipython-input-34-c48b4fb69043>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m#实现多页爬取和存入csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"data/weixin_zhonyin.csv\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'a+'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnewline\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m''\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mcsv_writer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwriter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtitles\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/weixin_zhonyin.csv'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Crawl several pages of Sogou WeChat search results and append\n",
"# (title, link, content) rows to a CSV file.\n",
"csv_path = 'data/weixin_zhonyin.csv'\n",
"# open() does not create missing directories; without this the cell fails with\n",
"# FileNotFoundError when data/ does not exist yet.\n",
"os.makedirs(os.path.dirname(csv_path), exist_ok=True)\n",
"\n",
"with open(csv_path, 'a+', newline='', encoding='utf-8') as file:\n",
"    csv_writer = csv.writer(file)\n",
"    for i in range(2):\n",
"        # Page-level names carry a search_ prefix so this cell does not overwrite\n",
"        # url / res / html / news_a created by the listing-page cells above.\n",
"        search_url = 'http://weixin.sogou.com/weixin?oq=&query={}&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1509458456759&page={}&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'.format('中印对峙', i+1)\n",
"        search_res = requests.get(search_url, headers=header, timeout=10)\n",
"        search_html = etree.HTML(search_res.text)\n",
"\n",
"        search_a = search_html.xpath('//div[@class=\"news-box\"]/ul[@class=\"news-list\"]/li/div[@class=\"txt-box\"]/h3/a')\n",
"        titles = [a.xpath('string(.)').strip() for a in search_a]\n",
"        links = search_html.xpath('//div[@class=\"news-box\"]/ul[@class=\"news-list\"]/li/div[@class=\"txt-box\"]/h3/a/@href')\n",
"\n",
"        contents = []\n",
"        for link in links:\n",
"            c_res = requests.get(link, headers=header, timeout=10)\n",
"            c_html = etree.HTML(c_res.text)\n",
"            p = c_html.xpath('//div[@id=\"js_content\"]/p/text()')\n",
"            if not p:\n",
"                # Some articles wrap their paragraph text in <span> elements.\n",
"                p = c_html.xpath('//div[@id=\"js_content\"]/p/span/text()')\n",
"            contents.append('。'.join(p))\n",
"\n",
"        for data in zip(titles, links, contents):\n",
"            print(data)\n",
"            csv_writer.writerow(list(data))\n",
"\n",
"        # Pause between result pages.\n",
"        time.sleep(2)\n",
"        print('第{}页爬取完毕'.format(i+1))"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|