80 lines
3.6 KiB
Python
80 lines
3.6 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
# Define your item pipelines here
|
||
#
|
||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||
|
||
import os
|
||
from pymongo import MongoClient
|
||
import pymysql.cursors
|
||
import random
|
||
import string
|
||
|
||
# MongoDB connection, configured entirely from CRAWLAB_* environment
# variables. Each `or` fallback also applies when a variable is set to an
# empty string, not only when it is missing.
mongo = MongoClient(
    host=os.getenv('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.getenv('CRAWLAB_MONGO_PORT') or 27017),
    # Username and password are passed through as None when unset.
    username=os.getenv('CRAWLAB_MONGO_USERNAME'),
    password=os.getenv('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.getenv('CRAWLAB_MONGO_AUTHSOURCE') or 'admin',
)

# Target database and collection; both default to 'test'.
db = mongo[os.getenv('CRAWLAB_MONGO_DB') or 'test']
col = db[os.getenv('CRAWLAB_COLLECTION') or 'test']

# Task identifier stamped onto every item by the pipeline (None when unset).
task_id = os.getenv('CRAWLAB_TASK_ID')
|
||
|
||
|
||
class ConfigSpiderPipeline(object):
    """Scrapy item pipeline that registers companies in MySQL and stores items in MongoDB.

    For every item the pipeline:

    1. stamps the module-level ``task_id`` onto the item,
    2. normalises the address and company-description fields,
    3. looks the company up in ``rc_member`` by ``username`` and inserts a
       new row when none exists,
    4. saves the item to the module-level MongoDB collection ``col``.
    """

    # Value of ``linkemail`` meaning "the company has not added a recruitment
    # e-mail"; it must never be stored as an address.
    _NO_EMAIL_PLACEHOLDER = "企业未添加招聘邮箱"
    # Domain appended to the randomly generated stand-in address.
    _FALLBACK_EMAIL_DOMAIN = "@des8.com"

    _SELECT_MEMBER_SQL = """select * from rc_member where username = %s"""

    # The three %s placeholders (username, email, address) are deliberately
    # NOT wrapped in quotes: PyMySQL quotes and escapes the parameters itself,
    # so a quoted '%s' would yield doubly quoted, invalid SQL. Every other
    # value is a fixed seed constant.
    _INSERT_MEMBER_SQL = """INSERT INTO `rencai`.`rc_member`(`uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`, `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`, `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`, `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`, `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`, `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`, `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`) VALUES (0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111', '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9', %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL, 0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0);
        """

    def __init__(self):
        """Open the MySQL connection and create the cursor used for all queries."""
        # NOTE(review): `settings` is not imported in the visible part of this
        # module; confirm it is brought into scope (e.g. the project's
        # settings module), otherwise this raises NameError.
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            port=settings.MYSQL_PORT,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        # All inserts / selects go through this single cursor.
        self.cursor = self.connect.cursor()

    def close_spider(self, spider):
        """Release the MySQL cursor and connection when the spider finishes.

        Scrapy calls this hook automatically if it is defined on a pipeline.
        """
        self.cursor.close()
        self.connect.close()

    @staticmethod
    def _clean_address(address):
        """Strip the "住所:" label, dashes and spaces from *address*."""
        if not address:
            return address
        for token in ("住所:", "-", " "):
            address = address.replace(token, '')
        return address

    @staticmethod
    def _clean_description(description):
        """Remove recruiting-platform boilerplate from a company description."""
        if not description:
            return description
        # Order matters: the bracketed phrase contains "招才猫" and must be
        # removed as a whole before the bare word is stripped.
        description = description.replace("使用58招人神器", '紧急')
        description = description.replace("【招才猫直聘APP】", '')
        description = description.replace("招才猫", '')
        return description

    @classmethod
    def _choose_email(cls, linkemail):
        """Return *linkemail* if it is a real value, else a random stand-in address.

        A stand-in (9 random alphanumeric characters plus a fixed domain) is
        generated when the value is empty or equals the "no e-mail" placeholder.
        """
        if linkemail and linkemail != cls._NO_EMAIL_PLACEHOLDER:
            return linkemail
        local_part = ''.join(
            random.sample(string.ascii_letters + string.digits, 9))
        return local_part + cls._FALLBACK_EMAIL_DOMAIN

    def _get_or_create_member_uid(self, username, email, address):
        """Return the uid of the ``rc_member`` row for *username*, inserting it if absent."""
        self.cursor.execute(self._SELECT_MEMBER_SQL, (username,))
        existing = self.cursor.fetchone()
        if existing:
            # Relies on the first selected column being the uid, as the
            # original `select *` lookup did.
            return existing[0]

        self.cursor.execute(self._INSERT_MEMBER_SQL, (username, email, address))
        # Commit so the new member row is persisted.
        self.connect.commit()
        return self.cursor.lastrowid

    def process_item(self, item, spider):
        """Clean *item*, ensure its company exists in MySQL, and save it to MongoDB.

        :param item: scraped item; reads ``com_address``, ``com_add``,
            ``com_name``, ``com_name2``, ``com_dec`` and ``linkemail``.
        :param spider: the spider that produced the item (unused).
        :returns: the same item, with ``task_id`` set and ``com_dec`` cleaned.
        """
        item['task_id'] = task_id

        # NOTE(review): these overwrite a value that is already non-empty,
        # which looks inverted (a fallback would normally apply when the
        # primary field is empty). Behaviour kept as-is; confirm the intent.
        if item['com_address']:
            item['com_address'] = item['com_add']
        if item['com_name']:
            item['com_name'] = item['com_name2']

        # The cleaned address is only used for the MySQL row; the item keeps
        # its own `com_address` value.
        com_address = self._clean_address(item['com_address'])
        item['com_dec'] = self._clean_description(item['com_dec'])
        email = self._choose_email(item['linkemail'])

        uid = self._get_or_create_member_uid(
            item['com_name'], email, com_address)
        print(uid)

        # Compare with None explicitly: PyMongo collections do not support
        # truth-value testing.
        if col is not None:
            # NOTE(review): Collection.save() is deprecated in PyMongo 3 and
            # removed in PyMongo 4; switch to insert_one()/replace_one() when
            # upgrading the driver.
            col.save(item)
        return item
|