import re

import scrapy
from scrapy.loader import ItemLoader

from linyidzwww.items import LinyidzwwwItem, replace_unicode


class LinyiNewsSpider(scrapy.Spider):
    """Crawl the linyi.dzwww.com news index and yield one item per article.

    ``parse`` walks the listing page and schedules a request for every
    article link; ``parse_article`` extracts the publication time, source,
    author and HTML body from the article page and yields a
    ``LinyidzwwwItem``.
    """

    name = "linyinews"
    allowed_domains = ["linyi.dzwww.com"]
    start_urls = ["https://linyi.dzwww.com/news/index.htm"]
    # NOTE: pagination is currently disabled. Further index pages follow the
    # pattern below; re-enable to crawl them as well.
    # for num in range(1, 29):
    #     start_urls.append("https://linyi.dzwww.com/news/index_" + str(num) + ".htm")

    def parse(self, response):
        """Yield a request for every article listed on the index page.

        Title, URL, thumbnail and source found in the listing are forwarded
        to ``parse_article`` through ``Request.meta``. Entries without a
        link are skipped; a missing thumbnail or source becomes ``''`` so a
        single incomplete entry no longer aborts the whole page.
        """
        for entry in response.css('div.tuwen>ul>li'):
            href = entry.css('h3>a::attr(href)').get()
            if not href:
                # Nothing to follow for this entry.
                continue

            # Resolve relative / protocol-relative links against the page URL;
            # absolute links are returned unchanged.
            url = response.urljoin(href)
            title = entry.css('h3>a::text').get(default='')
            thumb = entry.css('i>a>img::attr(src)').get(default='')

            # The source name is taken from the second text node of the span.
            tail_texts = entry.css('div.tail>span.left::text').getall()
            source = tail_texts[1].strip() if len(tail_texts) > 1 else ''

            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={
                    'title': title,
                    'url': url,
                    'thumb': thumb,
                    'source': source,
                },
            )

    def parse_article(self, response):
        """Build a ``LinyidzwwwItem`` from an article page.

        Values passed from the listing via ``response.meta`` take
        precedence; title and source are re-extracted from the article page
        only when the listing did not provide them. Fields that cannot be
        found are set to ``''``. If the article body container is missing,
        a warning is logged and no item is yielded.
        """
        meta = response.meta
        title = meta['title']
        url = meta['url']
        thumb = meta['thumb']
        source = meta['source']

        if not title:
            title = response.css('div.layout>h2::text').get(default='')

        # First text node of the info block; the patterns below look for
        # the timestamp, source and author inside it.
        info = response.css('div.layout>div.left::text').get(default='')

        # Match and extract the publication time.
        published_at = self._first_group(
            r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', info
        )

        if not source:
            # Match and extract the source.
            source = self._first_group(r'来源: (.*) 作者:', info)

        # Match and extract the author.
        author = self._first_group(r'作者: (.*)', info)

        # Match and extract the article body (HTML of the first container).
        raw_html = response.css('div.news-con').get()
        if raw_html is None:
            self.logger.warning('No article body found at %s', response.url)
            return
        html_content = raw_html.replace("\n", "").strip()

        # Assemble the item.
        item = LinyidzwwwItem()
        item['title'] = title
        item['url'] = url
        item['thumb'] = thumb
        item['source'] = source
        item['datetime'] = published_at
        item['author'] = author
        # replace_unicode is a project helper defined in linyidzwww.items.
        item['content'] = replace_unicode(html_content)
        yield item

    @staticmethod
    def _first_group(pattern, text):
        """Return the stripped first capture group of *pattern* in *text*, or ''."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else ''