# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst


# Remove plain spaces from the extracted text.
def remove_spaces(text):
    return text.replace(" ", "")


# Remove newlines, tabs, carriage returns, form feeds and vertical tabs with replace().
def remove_special_chars(text):
    cleaned_text = text.replace("\n", "").replace("\t", "").replace("\r", "").replace("\f", "").replace("\v", "")
    return cleaned_text


# Handle \uXXXX-style unicode characters, normalise the "news-con" class name
# and strip the "海报新闻出品" (Produced by Haibao News) watermark block.
def replace_unicode(text):
    cleaned_text = text.replace("", "").replace("", "").replace("news-con", "news-content")
    cleaned_text = cleaned_text.replace("\n\n海报新闻出品\n\n", "")
    cleaned_text = cleaned_text.replace("\n\n海报新闻出品\n\n", "")
    cleaned_text = cleaned_text.replace("海报新闻出品", "")
    # cleaned_text = cleaned_text.encode('utf-8').decode('unicode_escape')
    print(cleaned_text)
    return cleaned_text


# Same cleanup as replace_unicode, without the debug print.
def remove_content_chars(text):
    cleaned_text = text.replace('', "").replace('', "").replace("news-con", "news-content")
    cleaned_text = cleaned_text.replace("\n\n海报新闻出品\n\n", "")
    cleaned_text = cleaned_text.replace("\n\n海报新闻出品\n\n", "")
    cleaned_text = cleaned_text.replace("海报新闻出品", "")
    return cleaned_text


class LinyidzwwwItem(scrapy.Item):
    title = scrapy.Field()     # headline
    url = scrapy.Field()       # article URL
    datetime = scrapy.Field()  # publish date and time
    content = scrapy.Field(    # article body, cleaned by the processors above
        output_processor=MapCompose(remove_spaces, remove_special_chars, replace_unicode)
    )
    thumb = scrapy.Field()     # cover image
    source = scrapy.Field()    # source
    author = scrapy.Field()    # author