58zhaopin/config_spider/pipelines.py
2020-04-28 00:51:52 +08:00

80 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
import pymysql.cursors
import random
import string
# MongoDB handles shared by the pipeline below. Every connection parameter is
# read from a CRAWLAB_* environment variable; where a fallback is given, an
# unset *or empty* variable resolves to that fallback.
mongo = MongoClient(
    host=os.getenv('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.getenv('CRAWLAB_MONGO_PORT') or 27017),
    username=os.getenv('CRAWLAB_MONGO_USERNAME'),
    password=os.getenv('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.getenv('CRAWLAB_MONGO_AUTHSOURCE') or 'admin',
)

# Database and collection that scraped items are written to.
db = mongo.get_database(os.getenv('CRAWLAB_MONGO_DB') or 'test')
col = db.get_collection(os.getenv('CRAWLAB_COLLECTION') or 'test')

# Value of CRAWLAB_TASK_ID (None when unset); stamped onto every item.
task_id = os.getenv('CRAWLAB_TASK_ID')
class ConfigSpiderPipeline(object):
    """Register scraped companies in MySQL, then store the item in MongoDB.

    For every item the pipeline:

    1. stamps the module-level ``task_id`` onto the item,
    2. tidies the company address and description text,
    3. looks the company up in ``rc_member`` by user name and inserts a new
       member row when none exists,
    4. saves the item to the module-level Mongo collection ``col``.
    """

    # Text carried in ``item['linkemail']`` when the company published no
    # recruitment e-mail address.
    NO_EMAIL_PLACEHOLDER = "企业未添加招聘邮箱"

    # Domain used for generated stand-in addresses.
    GENERATED_EMAIL_DOMAIN = "@des8.com"

    _SELECT_MEMBER_SQL = "SELECT uid FROM rc_member WHERE username = %s"

    # The three %s placeholders (username, email, address) are deliberately
    # NOT wrapped in quotes: the driver escapes and quotes bound string
    # parameters itself, so quoting them here would double-quote the values.
    _INSERT_MEMBER_SQL = """INSERT INTO `rencai`.`rc_member`(`uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`, `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`, `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`, `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`, `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`, `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`, `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`) VALUES (0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111', '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9', %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL, 0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0);
"""

    def __init__(self):
        """Open the MySQL connection and the cursor used for all statements."""
        # NOTE(review): `settings` is not imported anywhere in the visible
        # file, so this raises NameError unless the name is brought into
        # scope elsewhere -- confirm where the MYSQL_* values are defined.
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            port=settings.MYSQL_PORT,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True,
        )
        # All reads and writes go through this single cursor.
        self.cursor = self.connect.cursor()

    def close_spider(self, spider):
        """Release the MySQL cursor and connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

    def process_item(self, item, spider):
        """Clean one scraped item, ensure its company has a member row, store it.

        Args:
            item: mapping with the keys ``com_address``, ``com_add``,
                ``com_name``, ``com_name2``, ``com_dec`` and ``linkemail``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, updated in place.
        """
        item['task_id'] = task_id

        # NOTE(review): these two conditions replace a *present* value with
        # the alternate field and leave a missing one untouched; that looks
        # inverted, but the intent is not visible here, so it is unchanged.
        if item['com_address']:
            item['com_address'] = item['com_add']
        if item['com_name']:
            item['com_name'] = item['com_name2']

        # The cleaned address is only used for the member row; the item keeps
        # its own `com_address` value.
        com_address = self._clean_address(item['com_address'])
        if item['com_dec']:
            item['com_dec'] = self._clean_description(item['com_dec'])
        email = self._resolve_email(item['linkemail'])

        uid = self._get_or_create_member(item['com_name'], email, com_address)
        print(uid)

        if col is not None:
            # NOTE(review): Collection.save() is deprecated in PyMongo 3 and
            # absent from PyMongo 4 -- confirm the installed version.
            col.save(item)
        return item

    @staticmethod
    def _clean_address(raw_address):
        """Strip the address label, dashes and spaces; a missing value yields ''."""
        cleaned = raw_address or ''
        for noise in ("住所:", "-", " "):
            cleaned = cleaned.replace(noise, '')
        return cleaned

    @staticmethod
    def _clean_description(description):
        """Remove or replace the promotional phrases in a company description."""
        # Order matters: the bracketed phrase contains the shorter one.
        substitutions = (
            ("使用58招人神器", '紧急'),
            ("【招才猫直聘APP】", ''),
            ("招才猫", ''),
        )
        for phrase, replacement in substitutions:
            description = description.replace(phrase, replacement)
        return description

    @classmethod
    def _resolve_email(cls, linkemail):
        """Return the published e-mail, or a generated one when none exists."""
        if linkemail and linkemail != cls.NO_EMAIL_PLACEHOLDER:
            return linkemail
        # No usable address: build a stand-in from 9 distinct random characters.
        local_part = ''.join(
            random.sample(string.ascii_letters + string.digits, 9))
        return local_part + cls.GENERATED_EMAIL_DOMAIN

    def _get_or_create_member(self, username, email, address):
        """Return the uid of the member named ``username``, inserting it if new."""
        self.cursor.execute(self._SELECT_MEMBER_SQL, (username,))
        existing = self.cursor.fetchone()
        if existing:
            return existing[0]

        try:
            self.cursor.execute(
                self._INSERT_MEMBER_SQL, (username, email, address))
            self.connect.commit()
        except Exception:
            # Do not leave a failed insert pending on the shared connection.
            self.connect.rollback()
            raise
        return self.cursor.lastrowid