scrapy_lydzww_news/linyidzwww/spiders/LinyiNews.py

import scrapy
import re

from linyidzwww.items import LinyidzwwwItem, replace_unicode


class LinyiNewsSpider(scrapy.Spider):
    name = "linyinews"
    allowed_domains = ["linyi.dzwww.com"]
    start_urls = ["https://linyi.dzwww.com/news/index.htm"]
    # To also crawl the paginated archive, extend start_urls
    # (or see the start_requests sketch below):
    # for num in range(1, 29):
    #     start_urls.append("https://linyi.dzwww.com/news/index_" + str(num) + ".htm")
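    #
    # A hedged alternative (a sketch, not this project's actual code): build the
    # paginated requests in a start_requests() override instead of mutating
    # start_urls at class-definition time. The page range 1..28 is taken from
    # the commented loop above.
    #
    #     def start_requests(self):
    #         yield scrapy.Request(self.start_urls[0], callback=self.parse)
    #         for num in range(1, 29):
    #             url = f"https://linyi.dzwww.com/news/index_{num}.htm"
    #             yield scrapy.Request(url, callback=self.parse)
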
    def parse(self, response):
        for sel in response.css('div.tuwen>ul>li'):
            title = sel.css('h3>a::text').get(default='').strip()
            url = sel.css('h3>a::attr(href)').get()
            if not url:
                continue
            # The thumbnail is optional; default to '' rather than indexing
            # extract()[0], which raises IndexError when no <img> is present.
            thumb = sel.css('i>a>img::attr(src)').get(default='')
            # The second span.left in the tail block carries the source name.
            left_spans = sel.css('div.tail>span.left::text').getall()
            source = left_spans[1].strip() if len(left_spans) > 1 else ''
            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'title': title, 'url': url, 'thumb': thumb, 'source': source},
            )

    def parse_article(self, response):
        title = response.meta['title']
        url = response.meta['url']
        thumb = response.meta['thumb']
        source = response.meta['source']
        if not title:
            title = response.css('div.layout>h2::text').get(default='')
        info = response.css('div.layout>div.left::text').get(default='')
        # Match and extract the publication time.
        time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
        match_time = re.search(time_pattern, info)
        datetime = match_time.group(1).strip() if match_time else ''
        if not source:
            # Match and extract the source ("来源: ... 作者:" in the page text).
            source_pattern = r'来源: (.*) 作者:'
            match_source = re.search(source_pattern, info)
            source = match_source.group(1).strip() if match_source else ''
        # Match and extract the author ("作者: ..." in the page text).
        author_pattern = r'作者: (.*)'
        match_author = re.search(author_pattern, info)
        author = match_author.group(1).strip() if match_author else ''
        # Extract the article body as raw HTML, collapsing newlines.
        html_content = response.css('div.news-con').get(default='').replace("\n", "").strip()
        # content = re.sub(r'<video.*?</video>', '', html_content, flags=re.DOTALL)
        # Populate and emit the item.
        item = LinyidzwwwItem()
        item['title'] = title
        item['url'] = url
        item['thumb'] = thumb
        item['source'] = source
        item['datetime'] = datetime
        item['author'] = author
        item['content'] = replace_unicode(html_content)
        yield item
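
# Run with: scrapy crawl linyinews
#
# For reference, a minimal sketch of what linyidzwww/items.py is assumed to
# provide. The seven fields match exactly what this spider populates; the body
# of replace_unicode is an assumption, not the project's actual implementation.
#
#     import scrapy
#
#     def replace_unicode(text):
#         # Assumed behavior: normalize full-width / non-breaking spaces
#         # left in the scraped HTML.
#         return text.replace('\u3000', ' ').replace('\xa0', ' ')
#
#     class LinyidzwwwItem(scrapy.Item):
#         title = scrapy.Field()
#         url = scrapy.Field()
#         thumb = scrapy.Field()
#         source = scrapy.Field()
#         datetime = scrapy.Field()
#         author = scrapy.Field()
#         content = scrapy.Field()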