# Listing metadata from the code viewer: 64 lines, 2.5 KiB, Python
import scrapy
|
|
import re
|
|
from linyidzwww.items import LinyidzwwwItem,replace_unicode
|
|
from scrapy.loader import ItemLoader
|
|
|
|
class LinyiNewsSpider(scrapy.Spider):
    """Spider for the news listing on linyi.dzwww.com.

    ``parse`` walks the listing page and schedules one request per article
    entry; ``parse_article`` extracts the article details from the response
    and yields a populated ``LinyidzwwwItem``.
    """

    name = "linyinews"
    allowed_domains = ["linyi.dzwww.com"]
    start_urls = ["https://linyi.dzwww.com/news/index.htm"]
    # NOTE: further listing pages follow the pattern
    # "https://linyi.dzwww.com/news/index_<num>.htm" (num in range(1, 29));
    # they are intentionally not part of start_urls at the moment.

    # Patterns applied to the article's info line; compiled once per class
    # instead of being rebuilt on every callback invocation.
    _TIME_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
    _SOURCE_RE = re.compile(r'来源: (.*) 作者:')
    _AUTHOR_RE = re.compile(r'作者: (.*)')

    @staticmethod
    def _first_group(pattern, text):
        """Return the stripped first capture group of *pattern* in *text*.

        Returns an empty string when the pattern does not match.
        """
        match = pattern.search(text)
        return match.group(1).strip() if match else ''

    def parse(self, response):
        """Schedule an article request for every entry on the listing page.

        Title, URL, thumbnail and source found on the listing are passed to
        ``parse_article`` through the request's ``meta`` dict. Missing
        optional pieces become empty strings instead of raising IndexError.
        """
        for entry in response.css('div.tuwen>ul>li'):
            href = entry.css('h3>a::attr(href)').get()
            if not href:
                # Without a link there is no article to fetch.
                continue

            title = entry.css('h3>a::text').get(default='')
            thumb = entry.css('i>a>img::attr(src)').get(default='')

            # The source is taken from the second text node of the tail span.
            tail_texts = entry.css('div.tail>span.left::text').getall()
            source = tail_texts[1].strip() if len(tail_texts) > 1 else ''

            # Request() rejects scheme-less URLs, so resolve relative links
            # against the listing page first.
            url = response.urljoin(href)

            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'title': title, 'url': url, 'thumb': thumb, 'source': source},
            )

    def parse_article(self, response):
        """Build a ``LinyidzwwwItem`` from an article page.

        Values handed over via ``response.meta`` take precedence; title and
        source fall back to what can be extracted from the article page
        itself. Date/time, source and author are parsed out of the info line
        and default to an empty string when absent. Articles without a
        ``div.news-con`` body are skipped with a warning.
        """
        meta = response.meta
        url = meta.get('url') or response.url
        thumb = meta.get('thumb', '')

        title = meta.get('title')
        if not title:
            title = response.css('div.layout>h2::text').get(default='')

        info = response.css('div.layout>div.left::text').get(default='')

        # Extract the publication timestamp.
        published = self._first_group(self._TIME_RE, info)

        # Extract the source only when the listing did not provide one.
        source = meta.get('source')
        if not source:
            source = self._first_group(self._SOURCE_RE, info)

        # Extract the author.
        author = self._first_group(self._AUTHOR_RE, info)

        # Extract the article body markup.
        body = response.css('div.news-con').get()
        if body is None:
            self.logger.warning("No article body found at %s", response.url)
            return
        html_content = body.replace("\n", "").strip()

        # Populate the item.
        item = LinyidzwwwItem()
        item['title'] = title
        item['url'] = url
        item['thumb'] = thumb
        item['source'] = source
        item['datetime'] = published
        item['author'] = author
        item['content'] = replace_unicode(html_content)
        yield item