58zhaopin/config_spider/spiders/spider.py
2020-04-28 00:51:52 +08:00

72 lines
4.2 KiB
Python

# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin, urlparse
def get_real_url(response, url):
if re.search(r'^https?', url):
return url
elif re.search(r'^\/\/', url):
u = urlparse(response.url)
return u.scheme + url
return urljoin(response.url, url)
class ConfigSpider(scrapy.Spider):
    """Scrape 58.com job listings in three hops.

    listing page (parse_list) -> job detail (parse_job_info) ->
    company detail (parse_com_info).  One Item accumulates fields
    across the hops via request meta and is yielded at the end.
    """

    name = 'config_spider'

    @staticmethod
    def _nth_text(selector, index):
        # Safe replacement for ``selector.extract()[index]``: the original
        # raised IndexError (losing the whole item) whenever the page
        # yielded fewer text nodes than expected.  Returns None instead.
        texts = selector.extract()
        return texts[index] if len(texts) > index else None

    def start_requests(self):
        # Seed with a fixed listing page (HR-manager category, page 3).
        yield scrapy.Request(url='https://linyi.58.com/renshizhuguan/pn3/?PGTID=0d302892-001f-99e7-35c5-0a69a314c4dd&ClickID=3', callback=self.parse_list)

    def parse_list(self, response):
        """Emit one job-detail request per listing row, then follow pagination."""
        prev_item = response.meta.get('item')
        for elem in response.css('li.job_item'):
            item = Item()
            item['job_list_title'] = elem.css('span.name::text').extract_first()
            item['job_url'] = elem.css('div.job_name > a::attr("href")').extract_first()
            item['com_name1'] = elem.css('div.comp_name > a::text').extract_first()
            if prev_item is not None:
                # Carry forward fields accumulated by earlier callbacks.
                for key, value in prev_item.items():
                    item[key] = value
            # Guard: rows without a detail link used to crash get_real_url
            # with a None url.
            if item['job_url']:
                yield scrapy.Request(url=get_real_url(response, item['job_url']), callback=self.parse_job_info, meta={'item': item})
        next_url = response.css('a.next::attr("href")').extract_first()
        # Guard: the last page has no "next" link; the original passed None
        # into get_real_url and raised a TypeError there.
        if next_url:
            yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': prev_item})

    def parse_job_info(self, response):
        """Fill job-detail fields, then follow the company link."""
        item = Item() if response.meta.get('item') is None else response.meta.get('item')
        item['job_title'] = response.css('span.pos_name::text').extract_first()
        item['job_num'] = response.css('span.pad_left_none::text').extract_first()
        item['job_xueli'] = response.css('.pos_base_condition > span:nth-last-child(2)::text').extract_first()
        item['job_yingjie'] = response.css('span.border_right_None::text').extract_first()
        # Concatenate every text node of the description block.
        item['job_des'] = "".join(response.css('div.des *::text').extract())
        item['com_jiesao'] = "".join(response.css('div.comp_intro > .txt *::text').extract())
        item['com_name2'] = response.css('div.baseInfo_link > a::text').extract_first()
        item['job_xinzi'] = response.css('span.pos_salary::text').extract_first()
        item['com_url'] = response.css('div.baseInfo_link > a::attr("href")').extract_first()
        if item['com_url']:
            yield scrapy.Request(url=get_real_url(response, item['com_url']), callback=self.parse_com_info, meta={'item': item})
        else:
            # No company page to visit: emit what we have instead of
            # crashing on a None url and dropping the item (original bug).
            yield item

    def parse_com_info(self, response):
        """Fill company-detail fields and yield the finished item."""
        item = Item() if response.meta.get('item') is None else response.meta.get('item')
        item['linkman'] = response.css('div.c_detail_item:nth-child(1) > em::text').extract_first()
        item['linkemail'] = response.css('div.c_detail:nth-child(2) > div.c_detail_item:nth-child(2) > em::text').extract_first()
        item['site_link'] = response.css('div.c_detail:nth-child(3) > div.c_detail_item:nth-child(2) > em::text').extract_first()
        # The phone number is rendered as an image (anti-scraping measure on
        # 58.com) — store its src rather than text.
        item['linktelpic'] = response.css('div.phone-protect > img::attr("src")').extract_first()
        item['com_name'] = response.css('div.nan_title > h2::text').extract_first()
        item['com_hangye'] = response.css('div.basic > p:nth-child(3) > span.fule *::text').extract_first()
        # Index 1 skips the label text node; _nth_text tolerates short pages.
        item['com_guimo'] = self._nth_text(response.css('div.basic > p:nth-child(4) *::text'), 1)
        item['com_manguimo'] = self._nth_text(response.css('div.basic > p:nth-child(5) *::text'), 1)
        item['com_fuli'] = ",".join(response.css('div.welfare > div.w_label > span::text').extract())
        item['com_teshe'] = ",".join(response.css('div.feature > div.w_label > span::text').extract())
        item['com_dec'] = response.css('div.introduction_box > p *::text').extract_first()
        item['com_fanwei'] = self._nth_text(response.css('div.buiness > div.b_detail > p:nth-child(7) *::text'), 1)
        item['com_address'] = self._nth_text(response.css('div.buiness > div.b_detail > p:nth-child(8) *::text'), 1)
        item['com_add'] = response.css('p.a_address::text').extract_first()
        yield item