diff --git a/config_spider/pipelines.py b/config_spider/pipelines.py
index 69af4c8..bc75abd 100644
--- a/config_spider/pipelines.py
+++ b/config_spider/pipelines.py
@@ -7,6 +7,10 @@
 import os
 
 from pymongo import MongoClient
+import pymysql.cursors
+import random
+import string
+from config_spider import settings  # settings.py must define the MYSQL_* values used below
 
 mongo = MongoClient(
     host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
@@ -19,9 +23,61 @@
 db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
 col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
 task_id = os.environ.get('CRAWLAB_TASK_ID')
+
 class ConfigSpiderPipeline(object):
+    def __init__(self):
+        # connect to the MySQL database
+        self.connect = pymysql.connect(
+            host=settings.MYSQL_HOST,
+            port=settings.MYSQL_PORT,
+            db=settings.MYSQL_DBNAME,
+            user=settings.MYSQL_USER,
+            passwd=settings.MYSQL_PASSWD,
+            charset='utf8',
+            use_unicode=True)
+        # this cursor runs every query below
+        self.cursor = self.connect.cursor()
+
     def process_item(self, item, spider):
         item['task_id'] = task_id
+        # tidy the scraped fields: fall back to the secondary
+        # name/address when the primary field came back empty
+        if not item['com_address']:
+            item['com_address'] = item['com_add']
+        if not item['com_name']:
+            item['com_name'] = item['com_name2']
+        # normalize the registered address
+        com_address = item['com_address'].replace("住所:", '')
+        com_address = com_address.replace("-", '')
+        com_address = com_address.replace(" ", '')
+        # strip 58.com boilerplate from the company description
+        item['com_dec'] = item['com_dec'] or ''  # extract_first() may return None
+        item['com_dec'] = item['com_dec'].replace("使用58招人神器", '紧急')
+        item['com_dec'] = item['com_dec'].replace("【招才猫直聘APP】", '')
+        item['com_dec'] = item['com_dec'].replace("招才猫", '')
+        #item['com_dec'] = item['com_dec'].replace("58", '')
+        # keep the published email; fabricate one when 58.com
+        # shows its "no recruiting email" placeholder
+        if item['linkemail'] == "企业未添加招聘邮箱":
+            email = ''.join(random.sample(string.ascii_letters + string.digits, 9))
+            email = email + "@des8.com"
+        else:
+            email = item['linkemail']
+        # deduplicate company accounts and fetch the uid
+        self.cursor.execute(
+            """select * from rc_member where username = %s""", (item['com_name'],))
+        repetition = self.cursor.fetchone()
+        if repetition:
+            uid = repetition[0]
+        else:
+            self.cursor.execute(
+                """INSERT INTO `rencai`.`rc_member`(`uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`, `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`, `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`, `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`, `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`, `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`, `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`) VALUES (0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111', '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9', %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL, 0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0);
+                """,  # unquoted %s placeholders: pymysql escapes and quotes the bound values itself
+                (item['com_name'], email, com_address))
+            # commit the insert
+            self.connect.commit()
+            uid = self.cursor.lastrowid
+            print(uid)
         if col is not None:
             col.save(item)
         return item
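Note on the pipeline above: it reads MySQL credentials from a module-level `settings` object, so the patch imports it directly from `config_spider`. A minimal sketch of the more idiomatic Scrapy wiring follows, assuming the MYSQL_* keys really are defined in settings.py (the `from_crawler` hook and the Settings getters are standard Scrapy; `mysql_conf` is an illustrative name):

    import pymysql

    class ConfigSpiderPipeline(object):
        def __init__(self, mysql_conf):
            # one connection per pipeline instance
            self.connect = pymysql.connect(**mysql_conf)
            self.cursor = self.connect.cursor()

        @classmethod
        def from_crawler(cls, crawler):
            # read the project's MYSQL_* keys from the live settings
            s = crawler.settings
            return cls({
                'host': s.get('MYSQL_HOST'),
                'port': s.getint('MYSQL_PORT'),
                'db': s.get('MYSQL_DBNAME'),
                'user': s.get('MYSQL_USER'),
                'passwd': s.get('MYSQL_PASSWD'),
                'charset': 'utf8',
                'use_unicode': True,
            })

        def close_spider(self, spider):
            # release the MySQL handle when the crawl ends
            self.cursor.close()
            self.connect.close()

Passing a plain dict keeps the pipeline testable without a running crawler and avoids importing the settings module at import time.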
diff --git a/config_spider/settings.py b/config_spider/settings.py
index 4b0965f..f5400d8 100644
--- a/config_spider/settings.py
+++ b/config_spider/settings.py
@@ -12,17 +12,17 @@
 import json
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'Crawlab Configurable Spider'
+BOT_NAME = '58zhaopin_git'
 
 SPIDER_MODULES = ['config_spider.spiders']
 NEWSPIDER_MODULE = 'config_spider.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Crawlab Spider'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
diff --git a/config_spider/spiders/spider.py b/config_spider/spiders/spider.py
index 6bc823c..4791ca7 100644
--- a/config_spider/spiders/spider.py
+++ b/config_spider/spiders/spider.py
@@ -38,8 +38,10 @@
         item['job_num'] = response.css('span.pad_left_none::text').extract_first()
         item['job_xueli'] = response.css('.pos_base_condition > span:nth-last-child(2)::text').extract_first()
         item['job_yingjie'] = response.css('span.border_right_None::text').extract_first()
-        item['job_des'] = response.css('div.des *::text').extract_first()
-        item['com_jiesao'] = response.css('div.comp_intro *::text').extract_first()
+        job_des = response.css('div.des *::text').extract()
+        item['job_des'] = "".join(job_des)
+        jiesao = response.css('div.comp_intro > .txt *::text').extract()
+        item['com_jiesao'] = "".join(jiesao)
         item['com_name2'] = response.css('div.baseInfo_link > a::text').extract_first()
         item['job_xinzi'] = response.css('span.pos_salary::text').extract_first()
         item['com_url'] = response.css('div.baseInfo_link > a::attr("href")').extract_first()
@@ -52,14 +54,16 @@
         item['site_link'] = response.css('div.c_detail:nth-child(3) > div.c_detail_item:nth-child(2) > em::text').extract_first()
         item['linktelpic'] = response.css('div.phone-protect > img::attr("src")').extract_first()
         item['com_name'] = response.css('div.nan_title > h2::text').extract_first()
+        item['com_hangye'] = response.css('div.basic > p:nth-child(3) > span.fule *::text').extract_first()
+        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract()[1]  # [1] skips the leading label text
+        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract()[1]
-        item['com_hangye'] = response.css('div.basic > p:nth-child(3) *::text').extract_first()
-        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract_first()
-        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract_first()
-        item['com_fuli'] = response.css('div.welfare > div.w_label > span::text').extract_first()
-        item['com_teshe'] = response.css('div.feature > div.w_label > span::text').extract_first()
+        fuli = response.css('div.welfare > div.w_label > span::text').extract()
+        item['com_fuli'] = ",".join(fuli)
+        teshe = response.css('div.feature > div.w_label > span::text').extract()
+        item['com_teshe'] = ",".join(teshe)
         item['com_dec'] = response.css('div.introduction_box > p *::text').extract_first()
-        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract_first()
-        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(9) *::text').extract_first()
+        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(7) *::text').extract()[1]
+        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract()[1]
         item['com_add'] = response.css('p.a_address::text').extract_first()
 
         yield item
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ff58dd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+PyMySQL==0.9.3
+pyOpenSSL==19.1.0
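The email branch in process_item keeps a published address and only fabricates a throwaway one when 58.com shows its placeholder text. The same logic pulled out as a standalone sketch for clarity (`make_email` is an illustrative name; the `@des8.com` domain and the 9-character sample come from the patch):

    import random
    import string

    # 58.com shows this text when a company has not published a recruiting email
    PLACEHOLDER = "企业未添加招聘邮箱"

    def make_email(linkemail):
        # keep a real scraped address as-is
        if linkemail and linkemail != PLACEHOLDER:
            return linkemail
        # otherwise build a random 9-character local part
        local = ''.join(random.sample(string.ascii_letters + string.digits, 9))
        return local + "@des8.com"

    print(make_email("企业未添加招聘邮箱"))  # e.g. aB3kZ9xQ1@des8.com
    print(make_email("hr@example.com"))      # hr@example.com

random.sample draws without replacement, so the fabricated local part never repeats a character; random.choices would allow repeats if that ever matters.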