# scrapy_lydzww_news/linyidzwww/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

# Remove space characters.
def remove_spaces(text):
    """Return *text* with every ASCII space character (U+0020) removed.

    Only the plain space is dropped; tabs, newlines and other whitespace
    are left untouched.
    """
    # Splitting on the literal space and re-joining with nothing between the
    # pieces deletes each space individually.
    pieces = text.split(" ")
    return "".join(pieces)
# Remove newline, tab, carriage-return, form-feed and vertical-tab characters.
def remove_special_chars(text):
    """Return *text* without line-break and tab-like control characters.

    The characters deleted are: line feed, horizontal tab, carriage return,
    form feed and vertical tab. Ordinary spaces are kept.
    """
    # The third argument of str.maketrans lists characters to delete.
    control_chars = "\n\t\r\f\v"
    deletion_table = str.maketrans("", "", control_chars)
    return text.translate(deletion_table)
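# Clean site-specific markup from the content.
# NOTE: despite the function name, no \uXXXX unicode-escape decoding happens here.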
def replace_unicode(text):
    """Strip site-specific markers and the promo credit from article markup.

    Despite its name, this function does not decode Unicode escape
    sequences. It performs these steps, in order:

    1. Remove the ``<!--zhengwen-->`` and ``<!--/zhengwen-->`` markers.
    2. Rename ``news-con`` to ``news-content``.
    3. Remove the "海报新闻出品" credit, including its centered ``<p>``
       wrapper (with or without ``<strong>``) when that wrapper is present.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Nothing is written to stdout.
    """
    # Function-scope import keeps this block self-contained.
    import re

    cleaned = text.replace("<!--zhengwen-->", "").replace("<!--/zhengwen-->", "")

    # The negative lookahead skips names that already read "news-content";
    # a plain str.replace would turn those into "news-contenttent".
    cleaned = re.sub(r"news-con(?!tent)", "news-content", cleaned)

    # Most specific pattern first, so a wrapped credit disappears as a whole
    # paragraph before the bare phrase is stripped.
    promo_patterns = (
        '<p style="text-align: center;"><strong>海报新闻出品</strong></p>',
        '<p style="text-align: center;">海报新闻出品</p>',
        "海报新闻出品",
    )
    for pattern in promo_patterns:
        cleaned = cleaned.replace(pattern, "")
    return cleaned
def remove_content_chars(text):
    """Strip site-specific markers and the promo credit from article markup.

    Steps, in order:

    1. Remove the ``<!--zhengwen-->`` and ``<!--/zhengwen-->`` markers.
    2. Rename ``news-con`` to ``news-content``.
    3. Remove the "海报新闻出品" credit, including its centered ``<p>``
       wrapper (with or without ``<strong>``) when that wrapper is present.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Function-scope import keeps this block self-contained.
    import re

    cleaned = text.replace("<!--zhengwen-->", "").replace("<!--/zhengwen-->", "")

    # Skip names that are already "news-content"; an unconditional
    # replacement would corrupt them into "news-contenttent".
    cleaned = re.sub(r"news-con(?!tent)", "news-content", cleaned)

    # Wrapped forms are removed before the bare phrase so that no empty
    # <p> element is left behind.
    for promo in (
        '<p style="text-align: center;"><strong>海报新闻出品</strong></p>',
        '<p style="text-align: center;">海报新闻出品</p>',
        "海报新闻出品",
    ):
        cleaned = cleaned.replace(promo, "")
    return cleaned
class LinyidzwwwItem(scrapy.Item):
    """Fields collected for a single scraped article."""

    # Title
    title = scrapy.Field()

    # Link (URL)
    url = scrapy.Field()

    # Date and time
    datetime = scrapy.Field()

    # Content. The output processor passes each value through the three
    # cleaners below, in the order listed.
    # NOTE(review): remove_spaces runs first, so by the time replace_unicode
    # runs, its space-containing '<p style="text-align: center;">' patterns
    # can no longer match - confirm this ordering is intended.
    content = scrapy.Field(
        output_processor=MapCompose(
            remove_spaces,
            remove_special_chars,
            replace_unicode,
        ),
    )

    # Cover image
    thumb = scrapy.Field()

    # Source
    source = scrapy.Field()

    # Author
    author = scrapy.Field()