# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import logging
import os
import random
import string

import pymysql.cursors
from pymongo import MongoClient

logger = logging.getLogger(__name__)

# MongoDB handle used to persist every scraped item; all connection details
# come from CRAWLAB_* environment variables, with local defaults.
mongo = MongoClient(
    host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
    port=int(os.environ.get('CRAWLAB_MONGO_PORT') or 27017),
    username=os.environ.get('CRAWLAB_MONGO_USERNAME'),
    password=os.environ.get('CRAWLAB_MONGO_PASSWORD'),
    authSource=os.environ.get('CRAWLAB_MONGO_AUTHSOURCE') or 'admin'
)
db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
task_id = os.environ.get('CRAWLAB_TASK_ID')

# Placeholder text the source site shows when a company has no e-mail.
_NO_EMAIL_PLACEHOLDER = "企业未添加招聘邮箱"

_SELECT_MEMBER_UID_SQL = "SELECT `uid` FROM rc_member WHERE username = %s LIMIT 1"

# NOTE: the placeholders are bare %s on purpose. PyMySQL quotes and escapes
# the parameters itself; wrapping them in quotes ('%s') produces broken SQL.
_INSERT_MEMBER_SQL = """
INSERT INTO `rencai`.`rc_member`(
    `uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`,
    `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`,
    `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`,
    `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`,
    `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`,
    `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`,
    `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`
) VALUES (
    0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111',
    '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9',
    %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL,
    0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0
)
"""


def _get_setting(settings, name, default=None):
    """Read ``name`` from a Scrapy ``Settings`` object, a mapping or a module."""
    getter = getattr(settings, 'get', None)
    if callable(getter):
        value = getter(name)
    else:
        value = getattr(settings, name, None)
    return default if value is None else value


def _resolve_email(link_email):
    """Return the scraped e-mail, or a random ``@des8.com`` one if missing.

    NOTE(review): the original branch was inverted (it kept the "no e-mail"
    placeholder text and replaced every real address with a random one).
    Confirm that using the real address is the intended behaviour.
    """
    if link_email and link_email != _NO_EMAIL_PLACEHOLDER:
        return link_email
    local_part = ''.join(random.sample(string.ascii_letters + string.digits, 9))
    return local_part + "@des8.com"


class ConfigSpiderPipeline(object):
    """Clean company items, mirror them into MySQL ``rc_member`` and MongoDB."""

    def __init__(self, settings=None):
        """Open the MySQL connection described by the ``MYSQL_*`` settings.

        :param settings: Scrapy ``Settings`` object, mapping or module that
            provides MYSQL_HOST, MYSQL_PORT, MYSQL_DBNAME, MYSQL_USER and
            MYSQL_PASSWD. Supplied automatically through ``from_crawler``.
        :raises ValueError: if no settings are given.
        """
        if settings is None:
            raise ValueError(
                "ConfigSpiderPipeline needs the project settings; let Scrapy "
                "build it via from_crawler() or pass them explicitly")
        self.connect = pymysql.connect(
            host=_get_setting(settings, 'MYSQL_HOST'),
            port=int(_get_setting(settings, 'MYSQL_PORT', 3306)),
            database=_get_setting(settings, 'MYSQL_DBNAME'),
            user=_get_setting(settings, 'MYSQL_USER'),
            password=_get_setting(settings, 'MYSQL_PASSWD'),
            charset='utf8',
            use_unicode=True)
        # All inserts / lookups go through this cursor.
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline from the crawler settings."""
        return cls(crawler.settings)

    def close_spider(self, spider):
        """Release the MySQL cursor and connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

    def _find_or_create_member(self, com_name, email, com_address):
        """Return the ``rc_member`` uid for ``com_name``, inserting if absent."""
        self.cursor.execute(_SELECT_MEMBER_UID_SQL, (com_name,))
        existing = self.cursor.fetchone()
        if existing:
            return existing[0]
        try:
            self.cursor.execute(
                _INSERT_MEMBER_SQL, (com_name, email, com_address))
            uid = self.cursor.lastrowid
            self.connect.commit()
        except pymysql.MySQLError:
            # Do not leave a half-open transaction on the shared connection.
            self.connect.rollback()
            raise
        return uid

    def process_item(self, item, spider):
        """Normalise one scraped item and persist it.

        Tags the item with the Crawlab task id, cleans the address and the
        company description, makes sure the company exists in ``rc_member``
        and finally stores the item in the MongoDB collection.

        :param item: scraped item (mapping-like).
        :param spider: spider that produced the item (unused).
        :returns: the same item, for the next pipeline stage.
        """
        item['task_id'] = task_id

        # Data consolidation.
        # NOTE(review): these overwrite a non-empty value with the alternate
        # field. If the intent was a fallback for a *missing* value, the
        # conditions are inverted. Confirm against the spider configuration.
        if item['com_address']:
            item['com_address'] = item['com_add']
        if item['com_name']:
            item['com_name'] = item['com_name2']

        # Address clean-up (tolerates a missing address).
        com_address = item['com_address'] or ''
        for noise in ("住所:", "-", " "):
            com_address = com_address.replace(noise, '')

        # Strip third-party advertising from the company description.
        if item['com_dec']:
            description = item['com_dec'].replace("使用58招人神器", '紧急')
            description = description.replace("【招才猫直聘APP】", '')
            item['com_dec'] = description.replace("招才猫", '')

        email = _resolve_email(item['linkemail'])

        # De-duplicate company accounts by name and obtain the uid.
        uid = self._find_or_create_member(item['com_name'], email, com_address)
        logger.debug("rc_member uid for %s: %s", item['com_name'], uid)

        if col is not None:
            # Collection.save() was removed in PyMongo 4; this is the
            # documented equivalent (insert, or upsert when _id is present).
            if '_id' in item:
                col.replace_one({'_id': item['_id']}, item, upsert=True)
            else:
                col.insert_one(item)
        return item