# -*- coding:utf-8 -*-
# Setup for the notebook: imports, request configuration, and the initial
# download of the news listing page that the following cells parse.
import csv
import json
import time

import requests
from lxml import etree

# Listing page that the title/link extraction cell reads from.
url = "https://sd.dzwww.com/sdnews/default.htm"
# Browser-like User-Agent sent with every request made in this notebook.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the listing.
res = requests.get(url, headers=header, timeout=10)
res.raise_for_status()

# Decode the raw bytes as GBK directly. The previous
# res.text.encode("latin1").decode("gbk") round trip only worked while
# requests guessed ISO-8859-1 for the response; decoding res.content gives the
# same text in that case and keeps working if the guess ever changes.
html = etree.HTML(res.content.decode("gbk", errors="replace"))
# Collect the news titles and links from the listing page.
#
# Both lists are built from the same <a> nodes (restricted to anchors that
# actually carry an href), so news_titles[i] always belongs to news_links[i].
# Two independent XPath queries would drift out of step as soon as a single
# anchor had no href attribute.
news_a = html.xpath('//div[3]/ul/li/h3/a[@href]')
print(len(news_a))

# string(.) concatenates all descendant text, so titles wrapped in nested
# inline tags are still captured in full.
news_titles = [a.xpath('string(.)').strip() for a in news_a]
print(news_titles)

# Links, in the same order as the titles above.
news_links = [a.get('href') for a in news_a]
for link in news_links:
    print(link)
# Scrape the detail page of every article found on the listing page.
import re  # imported in this cell because it is the only one that uses it


def first_node_text(tree, path):
    """Return the stripped text of the first node matching ``path``.

    Returns an empty string when nothing matches, so a single article with an
    unexpected layout does not abort the whole crawl with an IndexError.
    """
    nodes = tree.xpath(path)
    return nodes[0].xpath('string(.)').strip() if nodes else ''


def drop_layout_whitespace(text, newline=''):
    """Replace every newline in ``text`` with ``newline`` and remove tabs."""
    return text.replace('\n', newline).replace('\t', '')


# One dict per successfully fetched article, in listing order.
content = []
for news_link in news_links:
    try:
        c_res = requests.get(news_link, headers=header, timeout=10)
        c_res.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and move on instead of losing the whole run.
        print('skipped {}: {}'.format(news_link, exc))
        continue

    # Decode the raw bytes as GBK directly rather than relying on requests
    # having guessed latin-1 for the response text.
    c_html = etree.HTML(c_res.content.decode("gbk", errors="replace"))

    # Article body: remove anything that still looks like a tag, then newlines.
    c_body = first_node_text(c_html, '//div[@id="news-body"]')
    c_body = re.sub('<[^<]+?>', '', c_body).replace('\n', '').strip()

    c_title = first_node_text(c_html, '//div[@id="news-head"]/h2')
    print(c_title)

    # The first side-panel block is printed as the publication time; its
    # newlines become spaces so the parts stay separated.
    c_time = drop_layout_whitespace(
        first_node_text(c_html, '//div[@id="news-side"]/div[1]'), newline=' ')
    print(c_time)

    # Second side-panel block (printed as the "来源:" line in the output).
    c_laiyuan = drop_layout_whitespace(
        first_node_text(c_html, '//div[@id="news-side"]/div[2]'))
    print(c_laiyuan)

    # Third side-panel block (printed as the "作者:" line in the output).
    c_zuozhe = drop_layout_whitespace(
        first_node_text(c_html, '//div[@id="news-side"]/div[3]'))
    print(c_zuozhe)

    # Keep everything that was extracted; previously the body was computed and
    # then thrown away, and this list stayed empty.
    content.append({
        'link': news_link,
        'title': c_title,
        'time': c_time,
        'source': c_laiyuan,
        'author': c_zuozhe,
        'body': c_body,
    })
# Crawl several Sogou WeChat search result pages and append every article
# (title, link, text) to a CSV file.
import os  # only needed in this cell, to create the output directory

# Search URL template: the first placeholder is the query, the second the
# 1-based page number.
SEARCH_URL = "http://weixin.sogou.com/weixin?oq=&query={}&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1509458456759&page={}&ie=utf8&p=40040108&dp=1&w=01015002&dr=1"
# Result anchors; restricted to those with an href so that each title is
# always paired with its own link.
RESULT_LINK_XPATH = '//div[@class="news-box"]/ul[@class="news-list"]/li/div[@class="txt-box"]/h3/a[@href]'
csv_path = "data/weixin_zhonyin.csv"

# open() does not create missing directories; without this the cell fails
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# Append mode: re-running the cell adds rows to the existing file.
with open(csv_path, 'a+', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    for i in range(2):
        # Page-local names are used on purpose so that `url`, `res` and `html`
        # from the earlier cells are not overwritten by this one.
        page_url = SEARCH_URL.format("中印对峙", i + 1)
        page_res = requests.get(page_url, headers=header, timeout=10)
        page_res.raise_for_status()
        page_html = etree.HTML(page_res.text)

        for anchor in page_html.xpath(RESULT_LINK_XPATH):
            title = anchor.xpath('string(.)').strip()
            link = anchor.get('href')

            try:
                article_res = requests.get(link, headers=header, timeout=10)
                article_html = etree.HTML(article_res.text)
                paragraphs = article_html.xpath('//div[@id="js_content"]/p/text()')
                if not paragraphs:
                    # Fall back to text nested one level deeper, inside <span>.
                    paragraphs = article_html.xpath('//div[@id="js_content"]/p/span/text()')
                article_text = '。'.join(paragraphs)
            except requests.RequestException as exc:
                # Keep the title and link even when the article itself cannot
                # be downloaded; the text column is left empty.
                print('failed to fetch {}: {}'.format(link, exc))
                article_text = ''

            data = (title, link, article_text)
            print(data)
            csv_writer.writerow(list(data))

        # Pause between result pages.
        time.sleep(2)
        print("第{}页爬取完毕".format(i + 1))