Dependency test
commit 8d630cba53 (parent 1159685066)
@@ -7,6 +7,9 @@
 import os
 from pymongo import MongoClient
+import pymysql.cursors
+import random
+import string

 mongo = MongoClient(
     host=os.environ.get('CRAWLAB_MONGO_HOST') or 'localhost',
@@ -19,9 +22,58 @@ db = mongo[os.environ.get('CRAWLAB_MONGO_DB') or 'test']
 col = db[os.environ.get('CRAWLAB_COLLECTION') or 'test']
 task_id = os.environ.get('CRAWLAB_TASK_ID')


 class ConfigSpiderPipeline(object):
+    def __init__(self):
+        # Connect to the MySQL database
+        self.connect = pymysql.connect(
+            host=settings.MYSQL_HOST,
+            port=settings.MYSQL_PORT,
+            db=settings.MYSQL_DBNAME,
+            user=settings.MYSQL_USER,
+            passwd=settings.MYSQL_PASSWD,
+            charset='utf8',
+            use_unicode=True)
+        # Cursor used for all inserts, deletes, queries and updates
+        self.cursor = self.connect.cursor()

     def process_item(self, item, spider):
         item['task_id'] = task_id
+        # Data cleanup: fall back to the secondary fields when the primary ones are empty
+        if not item['com_address']:
+            item['com_address'] = item['com_add']
+        if not item['com_name']:
+            item['com_name'] = item['com_name2']
+        # Address cleanup: strip the "住所:" ("registered address:") label, dashes and spaces
+        com_address = item['com_address'].replace("住所:", '')
+        com_address = com_address.replace("-", '')
+        com_address = com_address.replace(" ", '')
+        # Strip 58.com promotional boilerplate from the company description
+        item['com_dec'] = item['com_dec'].replace("使用58招人神器", '紧急')
+        item['com_dec'] = item['com_dec'].replace("【招才猫直聘APP】", '')
+        item['com_dec'] = item['com_dec'].replace("招才猫", '')
+        #item['com_dec'] = item['com_dec'].replace("58",'')
+        # Email handling: "企业未添加招聘邮箱" means the company listed no recruiting
+        # email, so generate a random placeholder address in that case
+        if item['linkemail'] == "企业未添加招聘邮箱":
+            email = ''.join(random.sample(string.ascii_letters + string.digits, 9))
+            email = email + "@des8.com"
+        else:
+            email = item['linkemail']
+        # Deduplicate company accounts: look the name up first to reuse an existing uid
+        self.cursor.execute(
+            """select * from rc_member where username = %s""", (item['com_name'],))
+        repetition = self.cursor.fetchone()
+        if repetition:
+            uid = repetition[0]
+        else:
+            # Plain PyMySQL parameter binding; the %s placeholders must not be quoted,
+            # PyMySQL quotes the bound values itself
+            self.cursor.execute(
+                """INSERT INTO `rencai`.`rc_member`(`uid`, `username`, `password`, `email`, `moblie`, `reg_ip`, `reg_date`, `login_ip`, `login_date`, `usertype`, `login_hits`, `salt`, `address`, `name_repeat`, `qqid`, `status`, `pwuid`, `pw_repeat`, `lock_info`, `email_status`, `signature`, `sinaid`, `wxid`, `wxopenid`, `unionid`, `wxname`, `wxbindtime`, `passtext`, `source`, `regcode`, `did`, `claim`, `restname`, `appeal`, `appealtime`, `appealstate`, `signday`, `signdays`, `xcxid`, `xcxname`, `xcxkey`, `sqgroup_id`) VALUES (0, %s, '2fedf0d69979822e454e6a1d5cfd943a', %s, '15318537111', '127.0.0.1', 1557929151, '223.104.188.249', 1561643616, 3, 3, 'fb58d9', %s, 0, '', 1, 0, 0, '', NULL, '', '', '', '', '', '', 0, '', 1, 5, NULL, 0, 0, '', NULL, 1, 0, 0, NULL, '', '', 0);
+                """,
+                (item['com_name'], email, com_address))
+            # Commit the SQL statement
+            self.connect.commit()
+            uid = self.cursor.lastrowid
+        print(uid)
         if col is not None:
             col.save(item)
         return item
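The pipeline reads MYSQL_HOST, MYSQL_PORT, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD off a `settings` object, but no `settings` import appears anywhere in this diff, so that name must come from code outside the shown hunks. A minimal sketch of one way to wire it up with Scrapy's own helper; the setting names are taken from the code above, the defaults and everything else are assumptions:

import pymysql
from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # loads the project's settings.py

connect = pymysql.connect(
    host=settings.get('MYSQL_HOST', 'localhost'),   # assumed default
    port=settings.getint('MYSQL_PORT', 3306),       # assumed default
    db=settings.get('MYSQL_DBNAME'),
    user=settings.get('MYSQL_USER'),
    passwd=settings.get('MYSQL_PASSWD'),
    charset='utf8',
    use_unicode=True)

Note that get_project_settings() returns a dict-like Settings object, so values are read with .get()/.getint() rather than attribute access. The pipeline also has to be enabled for Scrapy to call it, e.g. ITEM_PIPELINES = {'config_spider.pipelines.ConfigSpiderPipeline': 300} in settings.py (that module path is an assumption; the diff does not show the file name).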
@@ -12,17 +12,17 @@ import json
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

-BOT_NAME = 'Crawlab Configurable Spider'
+BOT_NAME = '58zhaopin_git'

 SPIDER_MODULES = ['config_spider.spiders']
 NEWSPIDER_MODULE = 'config_spider.spiders'


 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Crawlab Spider'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
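The three settings flipped above (BOT_NAME, USER_AGENT, ROBOTSTXT_OBEY) apply project-wide. If the stock Crawlab values should stay in place for other spiders, Scrapy also accepts per-spider overrides through the custom_settings class attribute; a sketch, with the spider class and name invented and the values copied from this diff:

import scrapy

class Zhaopin58Spider(scrapy.Spider):
    name = '58zhaopin_git'  # hypothetical name
    # custom_settings is applied with spider priority, so it shadows
    # the project-level settings.py values for this spider only
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64',
        'ROBOTSTXT_OBEY': False,
    }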
@@ -38,8 +38,11 @@ class ConfigSpider(scrapy.Spider):
         item['job_num'] = response.css('span.pad_left_none::text').extract_first()
         item['job_xueli'] = response.css('.pos_base_condition > span:nth-last-child(2)::text').extract_first()
         item['job_yingjie'] = response.css('span.border_right_None::text').extract_first()
-        item['job_des'] = response.css('div.des *::text').extract_first()
-        item['com_jiesao'] = response.css('div.comp_intro *::text').extract_first()
+        job_des = response.css('div.des *::text').extract()
+        item['job_des'] = "".join(job_des)
+        jiesao = response.css('div.comp_intro > .txt *::text').extract()
+        item['com_jiesao'] = "".join(jiesao)
+        #item['com_jiesao'] = response.css('div.comp_intro > .txt *::text').extract_first()
         item['com_name2'] = response.css('div.baseInfo_link > a::text').extract_first()
         item['job_xinzi'] = response.css('span.pos_salary::text').extract_first()
         item['com_url'] = response.css('div.baseInfo_link > a::attr("href")').extract_first()
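The switch from extract_first() to extract() plus "".join() matters when the target text is spread across several child nodes: '*::text' yields one fragment per text node, and extract_first() silently drops everything after the first. A self-contained demonstration with Scrapy's Selector (the HTML is made up):

from scrapy.selector import Selector

html = '<div class="des"><p>Job duties:</p><p>crawl listings daily</p></div>'
sel = Selector(text=html)

print(sel.css('div.des *::text').extract_first())
# -> 'Job duties:'                    (first text node only)
print("".join(sel.css('div.des *::text').extract()))
# -> 'Job duties:crawl listings daily'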
@@ -52,14 +55,16 @@ class ConfigSpider(scrapy.Spider):
         item['site_link'] = response.css('div.c_detail:nth-child(3) > div.c_detail_item:nth-child(2) > em::text').extract_first()
         item['linktelpic'] = response.css('div.phone-protect > img::attr("src")').extract_first()
         item['com_name'] = response.css('div.nan_title > h2::text').extract_first()
-        item['com_hangye'] = response.css('div.basic > p:nth-child(3) *::text').extract_first()
-        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract_first()
-        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract_first()
-        item['com_fuli'] = response.css('div.welfare > div.w_label > span::text').extract_first()
-        item['com_teshe'] = response.css('div.feature > div.w_label > span::text').extract_first()
+        item['com_hangye'] = response.css('div.basic > p:nth-child(3) > span.fule *::text').extract_first()
+        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract()[1]
+        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract()[1]
+        fuli = response.css('div.welfare > div.w_label > span::text').extract()
+        item['com_fuli'] = ",".join(fuli)
+        teshe = response.css('div.feature > div.w_label > span::text').extract()
+        item['com_teshe'] = ",".join(teshe)
         item['com_dec'] = response.css('div.introduction_box > p *::text').extract_first()
-        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract_first()
-        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(9) *::text').extract_first()
+        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(7) *::text').extract()[1]
+        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract()[1]
         item['com_add'] = response.css('p.a_address::text').extract_first()
         yield item
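Several of the new selectors take .extract()[1] to skip a leading label node and keep the value after it. Unlike extract_first(), bare indexing raises IndexError when the list is shorter than expected, which drops the whole item. A defensive variant; this is a sketch only, and nth_text is an invented helper name:

def nth_text(response, query, index=1, default=None):
    # Return the index-th text fragment for a CSS query, or `default`
    # instead of raising IndexError when the node list is too short.
    parts = response.css(query).extract()
    return parts[index] if len(parts) > index else default

# Usage, mirroring one line from the diff:
# item['com_guimo'] = nth_text(response, 'div.basic > p:nth-child(4) *::text')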
requirements.txt (new file, 2 additions)
@@ -0,0 +1,2 @@
+PyMySQL==0.9.3
+pyOpenSSL==19.1.0