39 lines
1.9 KiB
Python
39 lines
1.9 KiB
Python
# Define here the models for your scraped items
|
|
#
|
|
# See documentation in:
|
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
|
|
import scrapy
|
|
from scrapy.loader.processors import MapCompose, TakeFirst
|
|
|
|
# Remove space characters
|
|
def remove_spaces(text):
    """Return ``text`` with every ASCII space character (U+0020) deleted.

    Only the plain space is removed; tabs, newlines and other whitespace
    characters are left untouched.
    """
    pieces = text.split(" ")
    return "".join(pieces)
|
|
# Remove newlines, tabs, carriage returns, form feeds and vertical tabs
|
|
def remove_special_chars(text):
    """Return ``text`` without line-break and tab-like control characters.

    The characters deleted are newline, tab, carriage return, form feed
    and vertical tab; everything else is kept as is.
    """
    # A translation table mapping each unwanted character to "delete".
    deletions = str.maketrans("", "", "\n\t\r\f\v")
    return text.translate(deletions)
|
|
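# Strip the zhengwen comment markers and the "海报新闻出品" banner from the content HTML (despite its name, this function performs no \uXXXX unicode-escape decoding)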
|
|
def replace_unicode(text):
    """Strip site-specific markers and the banner line from article HTML.

    Despite its name, this function performs no unicode-escape decoding.
    It applies these textual clean-ups, in order:

    * removes the ``<!--zhengwen-->`` / ``<!--/zhengwen-->`` comment markers;
    * rewrites the ``news-con`` token to ``news-content``;
    * removes the "海报新闻出品" banner, together with its centred ``<p>``
      wrapper when the wrapper is present verbatim.

    Args:
        text: The HTML fragment to clean.

    Returns:
        The cleaned string. The function has no side effects.
    """
    cleaned = text.replace("<!--zhengwen-->", "").replace("<!--/zhengwen-->", "")

    # Collapse any pre-existing "news-content" back to "news-con" before
    # expanding, so text that already carries the long form is not turned
    # into "news-contenttent" (and running the function twice is harmless).
    cleaned = cleaned.replace("news-content", "news-con")
    cleaned = cleaned.replace("news-con", "news-content")

    # Most specific wrapper first; the bare banner text is the fallback.
    banner_variants = (
        "<p style=\"text-align: center;\"><strong>海报新闻出品</strong></p>",
        "<p style=\"text-align: center;\">海报新闻出品</p>",
        "海报新闻出品",
    )
    for banner in banner_variants:
        cleaned = cleaned.replace(banner, "")

    return cleaned
|
|
def remove_content_chars(text):
    """Remove site-specific markers and the banner line from article HTML.

    Clean-ups applied, in order:

    * the ``<!--zhengwen-->`` / ``<!--/zhengwen-->`` comment markers are
      deleted;
    * the ``news-con`` token is rewritten to ``news-content``;
    * the "海报新闻出品" banner is deleted, together with its centred ``<p>``
      wrapper when the wrapper is present verbatim.

    Args:
        text: The HTML fragment to clean.

    Returns:
        The cleaned string.
    """
    for marker in ("<!--zhengwen-->", "<!--/zhengwen-->"):
        text = text.replace(marker, "")

    # Normalise to the short form first so an existing "news-content" is not
    # expanded a second time into "news-contenttent".
    text = text.replace("news-content", "news-con").replace("news-con", "news-content")

    # Try the wrapped forms before the bare banner text, otherwise the
    # wrappers would be left behind as empty paragraphs.
    for banner in (
        "<p style=\"text-align: center;\"><strong>海报新闻出品</strong></p>",
        "<p style=\"text-align: center;\">海报新闻出品</p>",
        "海报新闻出品",
    ):
        text = text.replace(banner, "")

    return text
|
|
class LinyidzwwwItem(scrapy.Item):
    """Item model holding the fields collected for one scraped article."""

    title = scrapy.Field()  # headline
    url = scrapy.Field()  # link to the article
    datetime = scrapy.Field()  # publication date/time

    # Article body. Processor order matters: replace_unicode matches patterns
    # that contain spaces (e.g. '<p style="text-align: center;">'), so it has
    # to run BEFORE remove_spaces, otherwise those patterns can never match
    # and empty banner paragraphs are left in the output. Line breaks are
    # dropped first so a banner split across lines is still recognised.
    # NOTE(review): remove_spaces also deletes the spaces inside HTML tags
    # (e.g. between a tag name and its attributes) — confirm this is intended.
    content = scrapy.Field(
        output_processor=MapCompose(
            remove_special_chars,
            replace_unicode,
            remove_spaces,
        )
    )

    thumb = scrapy.Field()  # cover image
    source = scrapy.Field()  # source / origin of the article
    author = scrapy.Field()  # author
|