# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin, urlparse

import scrapy

from config_spider.items import Item


def get_real_url(response, url):
    """Resolve ``url`` (as found in a page) against ``response.url``.

    Args:
        response: The response whose ``url`` attribute is the base URL.
        url: An href/src string taken from the page. May be absolute,
            protocol-relative (``//host/path``) or relative.

    Returns:
        An absolute URL string.
    """
    # Already absolute: require the full "scheme://" prefix so that relative
    # paths which merely start with "http" are still resolved below.
    if re.match(r'https?://', url):
        return url
    # Protocol-relative: reuse the scheme of the current page. The ':' is
    # required, otherwise the result would look like "https//host/path".
    if url.startswith('//'):
        return urlparse(response.url).scheme + ':' + url
    return urljoin(response.url, url)


class ConfigSpider(scrapy.Spider):
    """Spider that walks a job listing, each job page and its company page.

    The item is passed along the request chain via ``meta['item']`` and is
    yielded once the company page has been parsed.
    """

    name = 'config_spider'

    def start_requests(self):
        """Yield the initial listing request, handled by ``parse_list``."""
        yield scrapy.Request(
            url='https://linyi.58.com/renshizhuguan/pn3/?PGTID=0d302892-001f-99e7-35c5-0a69a314c4dd&ClickID=3',
            callback=self.parse_list,
        )

    def parse_list(self, response):
        """Parse a listing page.

        Yields one request per ``li.job_item`` element (to ``parse_job_info``)
        and, if a ``a.next`` link exists, a request for the next listing page.
        """
        prev_item = response.meta.get('item')
        for elem in response.css('li.job_item'):
            item = Item()
            item['job_list_title'] = elem.css('span.name::text').extract_first()
            item['job_url'] = elem.css('div.job_name > a::attr("href")').extract_first()
            item['com_name1'] = elem.css('div.comp_name > a::text').extract_first()
            # Carry over any fields collected by an earlier stage.
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            # extract_first() returns None when the selector matches nothing;
            # without a link there is no detail page to follow.
            if not item['job_url']:
                self.logger.debug('No job link in list entry on %s', response.url)
                continue
            yield scrapy.Request(
                url=get_real_url(response, item['job_url']),
                callback=self.parse_job_info,
                meta={'item': item},
            )

        next_url = response.css('a.next::attr("href")').extract_first()
        # The last page has no "next" link; stop paginating instead of
        # passing None to get_real_url (which would raise TypeError).
        if next_url:
            yield scrapy.Request(
                url=get_real_url(response, next_url),
                callback=self.parse_list,
                meta={'item': prev_item},
            )

    def parse_job_info(self, response):
        """Parse a job detail page and follow the company link.

        Adds the job fields to the item received via ``meta['item']`` (or a
        new ``Item`` if none was passed) and yields a request to
        ``parse_com_info``. If the page has no company link, the item is
        yielded as-is.
        """
        item = response.meta.get('item')
        if item is None:
            item = Item()
        # NOTE(review): extract_first() keeps only the first matching text
        # node, also for the '*::text' selectors - confirm this is intended.
        item['job_title'] = response.css('span.pos_name::text').extract_first()
        item['job_num'] = response.css('span.pad_left_none::text').extract_first()
        item['job_xueli'] = response.css('.pos_base_condition > span:nth-last-child(2)::text').extract_first()
        item['job_yingjie'] = response.css('span.border_right_None::text').extract_first()
        item['job_des'] = response.css('div.des *::text').extract_first()
        item['com_jiesao'] = response.css('div.comp_intro *::text').extract_first()
        item['com_name2'] = response.css('div.baseInfo_link > a::text').extract_first()
        item['job_xinzi'] = response.css('span.pos_salary::text').extract_first()
        item['com_url'] = response.css('div.baseInfo_link > a::attr("href")').extract_first()

        if not item['com_url']:
            # No company page to visit: emit what has been collected rather
            # than failing on a None URL and losing the item.
            yield item
            return
        yield scrapy.Request(
            url=get_real_url(response, item['com_url']),
            callback=self.parse_com_info,
            meta={'item': item},
        )

    def parse_com_info(self, response):
        """Parse a company page, add its fields to the item and yield it."""
        item = response.meta.get('item')
        if item is None:
            item = Item()
        item['linkman'] = response.css('div.c_detail_item:nth-child(1) > em::text').extract_first()
        item['linkemail'] = response.css('div.c_detail:nth-child(2) > div.c_detail_item:nth-child(2) > em::text').extract_first()
        item['site_link'] = response.css('div.c_detail:nth-child(3) > div.c_detail_item:nth-child(2) > em::text').extract_first()
        item['linktelpic'] = response.css('div.phone-protect > img::attr("src")').extract_first()
        item['com_name'] = response.css('div.nan_title > h2::text').extract_first()
        item['com_hangye'] = response.css('div.basic > p:nth-child(3) *::text').extract_first()
        item['com_guimo'] = response.css('div.basic > p:nth-child(4) *::text').extract_first()
        item['com_manguimo'] = response.css('div.basic > p:nth-child(5) *::text').extract_first()
        item['com_fuli'] = response.css('div.welfare > div.w_label > span::text').extract_first()
        item['com_teshe'] = response.css('div.feature > div.w_label > span::text').extract_first()
        item['com_dec'] = response.css('div.introduction_box > p *::text').extract_first()
        item['com_fanwei'] = response.css('div.buiness > div.b_detail > p:nth-child(8) *::text').extract_first()
        item['com_address'] = response.css('div.buiness > div.b_detail > p:nth-child(9) *::text').extract_first()
        item['com_add'] = response.css('p.a_address::text').extract_first()
        yield item