From bc562ffb10f6869353ce44f29292983a3551bfff Mon Sep 17 00:00:00 2001
From: meng
Date: Thu, 25 Jan 2024 01:48:36 +0800
Subject: [PATCH] init

---
 .gitignore                      |   1 +
 linyidzwww/__init__.py          |   0
 linyidzwww/items.py             |  38 ++++++++++++
 linyidzwww/middlewares.py       | 103 ++++++++++++++++++++++++++++++++
 linyidzwww/pipelines.py         |  13 ++++
 linyidzwww/settings.py          | 102 +++++++++++++++++++++++++++++++
 linyidzwww/spiders/LinyiNews.py |  63 +++++++++++++++++++
 linyidzwww/spiders/__init__.py  |   4 ++
 requirements.txt                | Bin 0 -> 108 bytes
 scrapy.cfg                      |  11 ++++
 10 files changed, 335 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 linyidzwww/__init__.py
 create mode 100644 linyidzwww/items.py
 create mode 100644 linyidzwww/middlewares.py
 create mode 100644 linyidzwww/pipelines.py
 create mode 100644 linyidzwww/settings.py
 create mode 100644 linyidzwww/spiders/LinyiNews.py
 create mode 100644 linyidzwww/spiders/__init__.py
 create mode 100644 requirements.txt
 create mode 100644 scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/linyidzwww/__init__.py b/linyidzwww/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/linyidzwww/items.py b/linyidzwww/items.py
new file mode 100644
index 0000000..a0b0144
--- /dev/null
+++ b/linyidzwww/items.py
@@ -0,0 +1,38 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+from scrapy.loader.processors import MapCompose, TakeFirst
+
+# Remove spaces
+def remove_spaces(text):
+    return text.replace(" ", "")
+
+# Remove newlines, tabs, carriage returns and other control characters with replace()
+def remove_special_chars(text):
+    cleaned_text = text.replace("\n", "").replace("\t", "").replace("\r", "").replace("\f", "").replace("\v", "")
+    return cleaned_text
+
+# Clean the article markup: normalise the "news-con" class name and strip the
+# "海报新闻出品" production-credit boilerplate (the commented-out line below
+# handled \uXXXX-style unicode escapes)
+def replace_unicode(text):
+    cleaned_text = text.replace("news-con", "news-content")
+    cleaned_text = cleaned_text.replace("海报新闻出品", "")
+    # cleaned_text = cleaned_text.encode('utf-8').decode('unicode_escape')
+    print(cleaned_text)
+    return cleaned_text
+
+def remove_content_chars(text):
+    cleaned_text = text.replace("news-con", "news-content")
+    cleaned_text = cleaned_text.replace("海报新闻出品", "")
+    return cleaned_text
+
+class LinyidzwwwItem(scrapy.Item):
+    title = scrapy.Field()     # title
+    url = scrapy.Field()       # link
+    datetime = scrapy.Field()  # date and time
+    content = scrapy.Field(output_processor=MapCompose(remove_spaces, remove_special_chars, replace_unicode))  # body
+    thumb = scrapy.Field()     # cover image
+    source = scrapy.Field()    # source
+    author = scrapy.Field()    # author
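Note: the output_processor declared on the content field only takes effect when the item
is populated through an ItemLoader; the spider later in this patch fills the item
directly and calls replace_unicode() by hand instead. A minimal sketch of the
loader-based variant, shown only for illustration (not part of this patch; it reuses the
same div.news-con selector the spider queries):

    from scrapy.loader import ItemLoader
    from linyidzwww.items import LinyidzwwwItem

    def parse_article(self, response):
        loader = ItemLoader(item=LinyidzwwwItem(), response=response)
        loader.add_value("title", response.meta["title"])
        # raw HTML of the article container; the field's processors clean it on load_item()
        loader.add_css("content", "div.news-con")
        yield loader.load_item()

With loaders, the usual convention is MapCompose as input_processor plus TakeFirst as
output_processor, so that single-valued fields end up as strings rather than lists.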

diff --git a/linyidzwww/middlewares.py b/linyidzwww/middlewares.py
new file mode 100644
index 0000000..b04a45f
--- /dev/null
+++ b/linyidzwww/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class LinyidzwwwSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class LinyidzwwwDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/linyidzwww/pipelines.py b/linyidzwww/pipelines.py
new file mode 100644
index 0000000..70acc4a
--- /dev/null
+++ b/linyidzwww/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class LinyidzwwwPipeline:
+    def process_item(self, item, spider):
+        return item
diff --git a/linyidzwww/settings.py b/linyidzwww/settings.py
new file mode 100644
index 0000000..e915540
--- /dev/null
+++ b/linyidzwww/settings.py
@@ -0,0 +1,102 @@
+# Scrapy settings for linyidzwww project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "linyidzwww"
+
+SPIDER_MODULES = ["linyidzwww.spiders"]
+NEWSPIDER_MODULE = "linyidzwww.spiders"
+
+# Use the scrapy-redis deduplication component instead of Scrapy's default one
+# Dedup class used to filter out duplicate URLs
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+# Scheduler that keeps the request queue in Redis
+SCHEDULER = "scrapy_redis.scheduler.Scheduler"
+# Persist the queue and dedup set, so URLs that were already crawled
+# (e.g. fixed data) are not crawled again
+SCHEDULER_PERSIST = True
+# Your Redis URL
+REDIS_URL = "redis://default:jhkdjhkjdhsIUTYURTU_hFBi3n@192.168.192.217:6379"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/120.0.0.0"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/120.0.0.0"
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "linyidzwww.middlewares.LinyidzwwwSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "linyidzwww.middlewares.LinyidzwwwDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "linyidzwww.pipelines.LinyidzwwwPipeline": 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
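The LinyidzwwwPipeline registered in ITEM_PIPELINES above is a pass-through, so items
are only persisted if a later pipeline or a feed export writes them somewhere. Purely
as an illustration (not part of this patch; the class name and output file are
hypothetical), a JSON-lines pipeline could look like this:

    import json
    from itemadapter import ItemAdapter

    class JsonLinesWriterPipeline:
        """Hypothetical pipeline: append each scraped item as one JSON line."""

        def open_spider(self, spider):
            self.file = open("items.jl", "a", encoding="utf-8")

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            self.file.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + "\n")
            return item

To take effect it would also have to be added to ITEM_PIPELINES with a priority after
LinyidzwwwPipeline.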
diff --git a/linyidzwww/spiders/LinyiNews.py b/linyidzwww/spiders/LinyiNews.py
new file mode 100644
index 0000000..8301210
--- /dev/null
+++ b/linyidzwww/spiders/LinyiNews.py
@@ -0,0 +1,63 @@
+import scrapy
+import re
+from linyidzwww.items import LinyidzwwwItem, replace_unicode
+from scrapy.loader import ItemLoader
+
+
+class LinyiNewsSpider(scrapy.Spider):
+    name = "linyinews"
+    allowed_domains = ["linyi.dzwww.com"]
+    start_urls = ["https://linyi.dzwww.com/news/index.htm"]
+    # for num in range(1, 29):
+    #     start_urls.append("https://linyi.dzwww.com/news/index_" + str(num) + ".htm")
+
+    def parse(self, response):
+        for sel in response.css('div.tuwen>ul>li'):
+            title = sel.css('h3>a::text').extract()[0]
+            url = sel.css('h3>a::attr(href)').extract()[0]
+            thumb = sel.css('i>a>img::attr(src)').extract()[0]
+            if not thumb:
+                thumb = ''
+            source = sel.css('div.tail>span.left::text').extract()[1].strip()
+            yield scrapy.Request(url, callback=self.parse_article,
+                                 meta={'title': title, 'url': url, 'thumb': thumb, 'source': source})
+
+    def parse_article(self, response):
+        title = response.meta['title']
+        url = response.meta['url']
+        thumb = response.meta['thumb']
+        source = response.meta['source']
+        if not title:
+            title = response.css('div.layout>h2::text').extract()[0]
+        info = response.css('div.layout>div.left::text').extract()[0]
+        # Match and extract the publication time
+        time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
+        match_time = re.search(time_pattern, info)
+        if match_time:
+            datetime = match_time.group(1).strip()
+        else:
+            datetime = ''
+        if not source:
+            # Match and extract the source
+            source_pattern = r'来源: (.*) 作者:'
+            match_source = re.search(source_pattern, info)
+            if match_source:
+                source = match_source.group(1).strip()
+            else:
+                source = ''
+        # Match and extract the author
+        author_pattern = r'作者: (.*)'
+        match_author = re.search(author_pattern, info)
+        if match_author:
+            author = match_author.group(1).strip()
+        else:
+            author = ''
+        # Extract the article body
+        html_content = response.css('div.news-con')[0].get().replace("\n", "").strip()
+        # content = re.sub(r'', '', html_content, flags=re.DOTALL)
+        # Assemble the item
+        item = LinyidzwwwItem()
+        item['title'] = title
+        item['url'] = url
+        item['thumb'] = thumb
+        item['source'] = source
+        item['datetime'] = datetime
+        item['author'] = author
+        item['content'] = replace_unicode(html_content)
+        yield item
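The commented-out loop in LinyiNewsSpider pre-builds the listing pages index_1.htm
through index_28.htm. An alternative is to follow pagination links from parse() itself;
the sketch below is only illustrative, and the 'a.next' selector is an assumption about
the listing markup, not taken from the site:

    # hypothetical addition inside LinyiNewsSpider.parse(), after the per-article loop
    next_page = response.css('a.next::attr(href)').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)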
diff --git a/linyidzwww/spiders/__init__.py b/linyidzwww/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/linyidzwww/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d1f0c79ada4ae26309ea9de084ad66b138d51d3
GIT binary patch
literal 108
zcmezWFPI^jp@<=op@5;1!4?RO81xtnfl!aZfPt5Ri=h}vUKgk`l_7;86DSH&YXDSb
Z4#dV_mE}M>2Pl_>WR4!tJdi0M3;-YS5BLB8

literal 0
HcmV?d00001

diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..627dcd1
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = linyidzwww.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = linyidzwww
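With these files in place, the crawl is normally started from the project root (the
directory containing scrapy.cfg) with "scrapy crawl linyinews". A minimal programmatic
entry point, shown only as a sketch (the helper script itself is not part of this
patch):

    # run_linyinews.py -- hypothetical helper script, run from the project root so
    # get_project_settings() picks up linyidzwww/settings.py via scrapy.cfg
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from linyidzwww.spiders.LinyiNews import LinyiNewsSpider

    process = CrawlerProcess(get_project_settings())
    process.crawl(LinyiNewsSpider)
    process.start()  # blocks until the crawl finishes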