Dependency test
commit 8d630cba53 (parent 1159685066)
@@ -7,6 +7,9 @@
 import os
 from pymongo import MongoClient
+import pymysql.cursors
+import random
+import string

 mongo = MongoClient(
     host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
@@ -19,9 +22,58 @@ db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
 col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
 task_id = os.environ.get('CRAWLAB_TASK_ID')


 class ConfigSpiderPipeline(object):
+    def __init__(self):
+        # Connect to the MySQL database
+        self.connect = pymysql.connect(
+            host=settings.MYSQL_HOST,
+            port=settings.MYSQL_PORT,
+            db=settings.MYSQL_DBNAME,
+            user=settings.MYSQL_USER,
+            passwd=settings.MYSQL_PASSWD,
+            charset='utf8',
+            use_unicode=True)
+        # Cursor used for all inserts, deletes, queries and updates
+        self.cursor = self.connect.cursor()

     def process_item(self, item, spider):
         item['task_id'] = task_id
+        # Data cleanup: fall back to the secondary fields when the primary ones are empty
+        if not item['com_address']:
+            item['com_address'] = item['com_add']
+        if not item['com_name']:
+            item['com_name'] = item['com_name2']
+        # Address cleanup: strip the "住所:" ("registered address:") label, dashes and spaces
+        com_address = item['com_address'].replace("住所:", '')
+        com_address = com_address.replace("-", '')
+        com_address = com_address.replace(" ", '')
+        # Strip 58.com promotional boilerplate from the company description
+        item['com_dec'] = item['com_dec'].replace("使用58招人神器", '紧急')
+        item['com_dec'] = item['com_dec'].replace("【招才猫直聘APP】", '')
+        item['com_dec'] = item['com_dec'].replace("招才猫", '')
+        #item['com_dec'] = item['com_dec'].replace("58",'')
+        # Email handling: "企业未添加招聘邮箱" means the company listed no recruiting
+        # email, so generate a random placeholder address in that case
+        if item['linkemail'] == "企业未添加招聘邮箱":
+            email = ''.join(random.sample(string.ascii_letters + string.digits, 9))
+            email = email + "@des8.com"
+        else:
+            email = item['linkemail']
+        # Deduplicate company accounts: look the name up first to reuse an existing uid
+        self.cursor.execute(
+            """select * from rc_member where username = %s""", (item['com_name'],))
+        repetition = self.cursor.fetchone()
+        if repetition:
+            uid = repetition[0]
+        else:
+            # Plain PyMySQL parameter binding; the %s placeholders must not be quoted,
+            # PyMySQL quotes the bound values itself
+            self.cursor.execute(
+                """INSERT INTO `rencai`.`rc_member`(`uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`, `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`, `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`, `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`, `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`, `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`, `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`) VALUES (0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111', '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9', %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL, 0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0);
+                """,
+                (item['com_name'], email, com_address))
+            # Commit the SQL statement
+            self.connect.commit()
+            uid = self.cursor.lastrowid
+        print(uid)
         if col is not None:
             col.save(item)
         return item
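The pipeline reads MYSQL_HOST, MYSQL_PORT, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD off a `settings` object, but no `settings` import appears anywhere in this diff, so that name must come from code outside the shown hunks. A minimal sketch of one way to wire it up with Scrapy's own helper; the setting names are taken from the code above, the defaults and everything else are assumptions:

import pymysql
from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # loads the project's settings.py

connect = pymysql.connect(
    host=settings.get('MYSQL_HOST', 'localhost'),   # assumed default
    port=settings.getint('MYSQL_PORT', 3306),       # assumed default
    db=settings.get('MYSQL_DBNAME'),
    user=settings.get('MYSQL_USER'),
    passwd=settings.get('MYSQL_PASSWD'),
    charset='utf8',
    use_unicode=True)

Note that get_project_settings() returns a dict-like Settings object, so values are read with .get()/.getint() rather than attribute access. The pipeline also has to be enabled for Scrapy to call it, e.g. ITEM_PIPELINES = {'config_spider.pipelines.ConfigSpiderPipeline': 300} in settings.py (that module path is an assumption; the diff does not show the file name).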
@@ -12,17 +12,17 @@ import json
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

-BOT_NAME = 'Crawlab Configurable Spider'
+BOT_NAME = '58zhaopin_git'

 SPIDER_MODULES = ['config_spider.spiders']
 NEWSPIDER_MODULE = 'config_spider.spiders'


 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Crawlab Spider'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
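The three settings flipped above (BOT_NAME, USER_AGENT, ROBOTSTXT_OBEY) apply project-wide. If the stock Crawlab values should stay in place for other spiders, Scrapy also accepts per-spider overrides through the custom_settings class attribute; a sketch, with the spider class and name invented and the values copied from this diff:

import scrapy

class Zhaopin58Spider(scrapy.Spider):
    name = '58zhaopin_git'  # hypothetical name
    # custom_settings is applied with spider priority, so it shadows
    # the project-level settings.py values for this spider only
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64',
        'ROBOTSTXT_OBEY': False,
    }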
@@ -38,8 +38,11 @@ class ConfigSpider(scrapy.Spider):
         item['job_num'] = response.css('span.pad_left_none::text').extract_first()
         item['job_xueli'] = response.css('.pos_base_condition > span:nth-last-child(2)::text').extract_first()
         item['job_yingjie'] = response.css('span.border_right_None::text').extract_first()
-        item['job_des'] = response.css('div.des *::text').extract_first()
-        item['com_jiesao'] = response.css('div.comp_intro *::text').extract_first()
+        job_des = response.css('div.des *::text').extract()
+        item['job_des'] = "".join(job_des)
+        jiesao = response.css('div.comp_intro > .txt *::text').extract()
+        item['com_jiesao'] = "".join(jiesao)
+        #item['com_jiesao'] = response.css('div.comp_intro > .txt *::text').extract_first()
         item['com_name2'] = response.css('div.baseInfo_link > a::text').extract_first()
         item['job_xinzi'] = response.css('span.pos_salary::text').extract_first()
         item['com_url'] = response.css('div.baseInfo_link > a::attr("href")').extract_first()
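The switch from extract_first() to extract() plus "".join() matters when the target text is spread across several child nodes: '*::text' yields one fragment per text node, and extract_first() silently drops everything after the first. A self-contained demonstration with Scrapy's Selector (the HTML is made up):

from scrapy.selector import Selector

html = '<div class="des"><p>Job duties:</p><p>crawl listings daily</p></div>'
sel = Selector(text=html)

print(sel.css('div.des *::text').extract_first())
# -> 'Job duties:'                    (first text node only)
print("".join(sel.css('div.des *::text').extract()))
# -> 'Job duties:crawl listings daily'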
@@ -52,14 +55,16 @@ class ConfigSpider(scrapy.Spider):
         item['site_link'] = response.css('div.c_detail:nth-child(3) > div.c_detail_item:nth-child(2) > em::text').extract_first()
         item['linktelpic'] = response.css('div.phone-protect > img::attr("src")').extract_first()
         item['com_name'] = response.css('div.nan_title > h2::text').extract_first()
-        item['com_hangye'] = response.css('div.basic > p:nth-child(3) *::text').extract_first()
-        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract_first()
-        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract_first()
-        item['com_fuli'] = response.css('div.welfare > div.w_label > span::text').extract_first()
-        item['com_teshe'] = response.css('div.feature > div.w_label > span::text').extract_first()
+        item['com_hangye'] = response.css('div.basic > p:nth-child(3) > span.fule *::text').extract_first()
+        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract()[1]
+        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract()[1]
+        fuli = response.css('div.welfare > div.w_label > span::text').extract()
+        item['com_fuli'] = ",".join(fuli)
+        teshe = response.css('div.feature > div.w_label > span::text').extract()
+        item['com_teshe'] = ",".join(teshe)
         item['com_dec'] = response.css('div.introduction_box > p *::text').extract_first()
-        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract_first()
-        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(9) *::text').extract_first()
+        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(7) *::text').extract()[1]
+        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract()[1]
         item['com_add'] = response.css('p.a_address::text').extract_first()
         yield item
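Several of the new selectors take .extract()[1] to skip a leading label node and keep the value after it. Unlike extract_first(), bare indexing raises IndexError when the list is shorter than expected, which drops the whole item. A defensive variant; this is a sketch only, and nth_text is an invented helper name:

def nth_text(response, query, index=1, default=None):
    # Return the index-th text fragment for a CSS query, or `default`
    # instead of raising IndexError when the node list is too short.
    parts = response.css(query).extract()
    return parts[index] if len(parts) > index else default

# Usage, mirroring one line from the diff:
# item['com_guimo'] = nth_text(response, 'div.basic > p:nth-child(4) *::text')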
requirements.txt (new file, 2 additions)
@@ -0,0 +1,2 @@
+PyMySQL==0.9.3
+pyOpenSSL==19.1.0