scrapy_lydzww_news/linyidzwww/spiders/LinyiNews.py

import scrapy
import re

from linyidzwww.items import LinyidzwwwItem, replace_unicode


class LinyiNewsSpider(scrapy.Spider):
    name = "linyinews"
    allowed_domains = ["linyi.dzwww.com"]
    start_urls = ["https://linyi.dzwww.com/news/index.htm"]
    # To also crawl the paginated archive, extend start_urls
    # (or see the start_requests sketch below):
    # for num in range(1, 29):
    #     start_urls.append("https://linyi.dzwww.com/news/index_" + str(num) + ".htm")
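    #
    # A hedged alternative (a sketch, not this project's actual code): build the
    # paginated requests in a start_requests() override instead of mutating
    # start_urls at class-definition time. The page range 1..28 is taken from
    # the commented loop above.
    #
    #     def start_requests(self):
    #         yield scrapy.Request(self.start_urls[0], callback=self.parse)
    #         for num in range(1, 29):
    #             url = f"https://linyi.dzwww.com/news/index_{num}.htm"
    #             yield scrapy.Request(url, callback=self.parse)
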
    def parse(self, response):
        for sel in response.css('div.tuwen>ul>li'):
            title = sel.css('h3>a::text').get(default='').strip()
            url = sel.css('h3>a::attr(href)').get()
            if not url:
                continue
            # The thumbnail is optional; default to '' rather than indexing
            # extract()[0], which raises IndexError when no <img> is present.
            thumb = sel.css('i>a>img::attr(src)').get(default='')
            # The second span.left in the tail block carries the source name.
            left_spans = sel.css('div.tail>span.left::text').getall()
            source = left_spans[1].strip() if len(left_spans) > 1 else ''
            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'title': title, 'url': url, 'thumb': thumb, 'source': source},
            )

    def parse_article(self, response):
        title = response.meta['title']
        url = response.meta['url']
        thumb = response.meta['thumb']
        source = response.meta['source']
        if not title:
            title = response.css('div.layout>h2::text').get(default='')
        info = response.css('div.layout>div.left::text').get(default='')
        # Match and extract the publication time.
        time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
        match_time = re.search(time_pattern, info)
        datetime = match_time.group(1).strip() if match_time else ''
        if not source:
            # Match and extract the source ("来源: ... 作者:" in the page text).
            source_pattern = r'来源: (.*) 作者:'
            match_source = re.search(source_pattern, info)
            source = match_source.group(1).strip() if match_source else ''
        # Match and extract the author ("作者: ..." in the page text).
        author_pattern = r'作者: (.*)'
        match_author = re.search(author_pattern, info)
        author = match_author.group(1).strip() if match_author else ''
        # Extract the article body as raw HTML, collapsing newlines.
        html_content = response.css('div.news-con').get(default='').replace("\n", "").strip()
        # content = re.sub(r'<video.*?</video>', '', html_content, flags=re.DOTALL)
        # Populate and emit the item.
        item = LinyidzwwwItem()
        item['title'] = title
        item['url'] = url
        item['thumb'] = thumb
        item['source'] = source
        item['datetime'] = datetime
        item['author'] = author
        item['content'] = replace_unicode(html_content)
        yield item
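
# Run with: scrapy crawl linyinews
#
# For reference, a minimal sketch of what linyidzwww/items.py is assumed to
# provide. The seven fields match exactly what this spider populates; the body
# of replace_unicode is an assumption, not the project's actual implementation.
#
#     import scrapy
#
#     def replace_unicode(text):
#         # Assumed behavior: normalize full-width / non-breaking spaces
#         # left in the scraped HTML.
#         return text.replace('\u3000', ' ').replace('\xa0', ' ')
#
#     class LinyidzwwwItem(scrapy.Item):
#         title = scrapy.Field()
#         url = scrapy.Field()
#         thumb = scrapy.Field()
#         source = scrapy.Field()
#         datetime = scrapy.Field()
#         author = scrapy.Field()
#         content = scrapy.Field()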