scrapy_lydzww_news/linyidzwww/items.py
2024-01-25 01:48:36 +08:00

39 lines
1.9 KiB
Python

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
def remove_spaces(text):
    """Return ``text`` with every ASCII space character removed.

    Only the plain space (U+0020) is dropped; tabs, newlines and other
    whitespace such as full-width spaces are left untouched.
    """
    pieces = text.split(" ")
    return "".join(pieces)
def remove_special_chars(text):
    """Return ``text`` without line-break and tab-like control characters.

    The characters removed are newline, horizontal tab, carriage return,
    form feed and vertical tab. Ordinary spaces are kept.
    """
    # A deletion-only translation table drops all five characters in one pass.
    deletion_table = str.maketrans("", "", "\n\t\r\f\v")
    return text.translate(deletion_table)
def replace_unicode(text):
    """Strip site-specific markers and boilerplate from article HTML.

    Despite its name, this function does not decode unicode escape
    sequences. The name is kept because the item's ``content`` field
    references it. It performs, in order:

    * removal of the ``<!--zhengwen-->`` / ``<!--/zhengwen-->`` markers;
    * renaming of ``news-con`` to ``news-content``;
    * removal of the "海报新闻出品" credit, with or without its centered
      ``<p>`` / ``<strong>`` wrapper.

    Args:
        text: The raw article markup as a string.

    Returns:
        The cleaned string.
    """
    cleaned_text = text.replace("<!--zhengwen-->", "").replace("<!--/zhengwen-->", "")
    # The negative lookahead leaves an existing "news-content" alone; a plain
    # str.replace would turn it into "news-contenttent".
    cleaned_text = re.sub(r"news-con(?!tent)", "news-content", cleaned_text)
    # Most specific wrappers first, so the bare-text fallback only catches
    # occurrences that were not wrapped in one of the known paragraphs.
    for boilerplate in (
        "<p style=\"text-align: center;\"><strong>海报新闻出品</strong></p>",
        "<p style=\"text-align: center;\">海报新闻出品</p>",
        "海报新闻出品",
    ):
        cleaned_text = cleaned_text.replace(boilerplate, "")
    return cleaned_text
def remove_content_chars(text):
    """Strip site-specific markers and boilerplate from article HTML.

    Performs, in order:

    * removal of the ``<!--zhengwen-->`` / ``<!--/zhengwen-->`` markers;
    * renaming of ``news-con`` to ``news-content``;
    * removal of the "海报新闻出品" credit, with or without its centered
      ``<p>`` / ``<strong>`` wrapper.

    Args:
        text: The raw article markup as a string.

    Returns:
        The cleaned string.
    """
    cleaned_text = text.replace('<!--zhengwen-->', "").replace('<!--/zhengwen-->', "")
    # The negative lookahead leaves an existing "news-content" alone; a plain
    # str.replace would turn it into "news-contenttent".
    cleaned_text = re.sub(r"news-con(?!tent)", "news-content", cleaned_text)
    # Most specific wrappers first, so the bare-text fallback only catches
    # occurrences that were not wrapped in one of the known paragraphs.
    for boilerplate in (
        "<p style=\"text-align: center;\"><strong>海报新闻出品</strong></p>",
        "<p style=\"text-align: center;\">海报新闻出品</p>",
        "海报新闻出品",
    ):
        cleaned_text = cleaned_text.replace(boilerplate, "")
    return cleaned_text
class LinyidzwwwItem(scrapy.Item):
    """Fields collected for one scraped article."""

    title = scrapy.Field()  # title
    url = scrapy.Field()  # link
    datetime = scrapy.Field()  # date and time
    # Processor order matters: replace_unicode matches markup that contains
    # spaces (e.g. '<p style="text-align: center;">'), so it has to run before
    # remove_spaces; otherwise those patterns can never match. Line breaks and
    # tabs are dropped first so they cannot split a pattern either.
    # NOTE(review): remove_spaces also strips the spaces inside HTML tags
    # ('<p style=...>' becomes '<pstyle=...>') -- confirm this is intended.
    content = scrapy.Field(
        output_processor=MapCompose(remove_special_chars, replace_unicode, remove_spaces)
    )  # content
    thumb = scrapy.Field()  # cover image
    source = scrapy.Field()  # source
    author = scrapy.Field()  # author