
A requests scraper for 职友集 (jobui.com) job listings

Posted: 2021-04-03 20:55:45


This is a small scraper I wrote to collect job-listing data for my graduation project. It is fairly simple; the main problem is that after crawling for a while 职友集's anti-scraping detection kicks in and the crawl has to be restarted. (PS: copying all of the request headers your browser sends to 职友集 into the requests headers dict may work better; I did not include all of them, so you can try that yourself. Going through proxies is another good option; a rough proxy sketch follows the next paragraph.)

The script crawls network security engineer postings by city; with a few small changes you can adapt it for your own use.
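As a hedged illustration of the proxy idea mentioned in the PS above: requests accepts a proxies mapping per call, so one simple approach is to cycle through a small pool of proxies between page fetches. The proxy addresses and the get_with_proxy helper below are hypothetical placeholders, not part of the original script.

import itertools
import requests

# Hypothetical proxy pool -- replace these placeholder addresses with proxies you can actually use.
PROXY_POOL = [
    'http://127.0.0.1:8001',
    'http://127.0.0.1:8002',
]
proxy_cycle = itertools.cycle(PROXY_POOL)

def get_with_proxy(url, headers):
    # Route each request through the next proxy in the pool.
    proxy = next(proxy_cycle)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)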

Suggestions

Use the request header values from your own logged-in session; this makes errors and blocks less likely.

Press F12 -> Network -> Doc, find the request headers of the page request, copy the values out, and place them into the headers dict in the same format, as sketched below.
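A minimal sketch of that format, assuming you have copied the values from your own logged-in session in DevTools (the header values below are placeholders to be replaced with your own):

import requests

# Paste the values from F12 -> Network -> Doc -> Request Headers here.
# The strings below are placeholders, not working credentials.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'jobui_p=...; PHPSESSID=...; isAccessToken=...',  # your logged-in cookie string
}

resp = requests.get('https://www.jobui.com/jobs?jobKw=...&cityKw=...', headers=headers)
print(resp.status_code)  # 200 means the request went through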

# -*- coding: utf-8 -*-
"""
Created on Sat Feb 20 18:15:25
@author: zzy
"""
from bs4 import BeautifulSoup as bf
import requests
import time
from collections import deque
import os
import json
import csv


def get():
    items = []
    posqueue, numqueue = deque([]), deque([])
    # The host portion of these URLs was missing in the original post; it is inferred here
    # from the jobui_* cookie names (职友集's domain).
    base = 'https://www.jobui.com'
    # Search URLs for "network security engineer" in Beijing, Shanghai, Shenzhen and Guangzhou
    # (city and job keywords are URL-encoded Chinese; the n= parameter is the page number).
    urls = ['/jobs?cityKw=%E5%8C%97%E4%BA%AC&jobKw=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8%E5%B7%A5%E7%A8%8B%E5%B8%88&n=',
            '/jobs?jobKw=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8%E5%B7%A5%E7%A8%8B%E5%B8%88&cityKw=%E4%B8%8A%E6%B5%B7&n=',
            '/jobs?jobKw=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8%E5%B7%A5%E7%A8%8B%E5%B8%88&cityKw=%E6%B7%B1%E5%9C%B3&n=',
            '/jobs?jobKw=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8%E5%B7%A5%E7%A8%8B%E5%B8%88&cityKw=%E5%B9%BF%E5%B7%9E&n=']
    cookie = 'jobui_p=1613901762779_44902703; AXUDE_jobuiinfo=PKHSerewtz; PHPSESSID=94f0q14lsqg18n9706hppdfha4; isAccessToken=4C5DC7967AE3A35CF1FD9F9DC3234A96; isloginType=qq; isloginOpenID=A0978770BC02EE56FF01E4143279F5AA; isloginStatus=fDR6uZuIgBc%3D; jobui_jobNotifyTime=1613901778; jobui_companyNotifyTime=1613901778; TN_VisitCookie=2; TN_VisitNum=2; jobui_notifiset=1; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1613901763,1613901778; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1613901778; jobui_notifyTime=1613901778; jobui_login_guide_popup=1'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.71 Safari/537.36',
        'Cookie': cookie}

    # First pass: read the total result count and the city name of each search,
    # and queue them up for the crawling pass below.
    for url in urls:
        time.sleep(1)
        wb_data = requests.get(base + url, headers=headers)
        print(wb_data)  # <Response [200]> means the request succeeded
        soup = bf(wb_data.content, 'lxml')
        al = soup.find('span', attrs={'class': 'sort-cut-result'}).span.get_text().strip()
        pos = str(soup.find('strong').get_text())[0:2]  # first two characters = city name
        rang = int((int(al) / 15)) + 2  # 15 listings per page, so this is the number of pages to walk
        posqueue.append(pos)
        numqueue.append(rang)
        print(posqueue, numqueue)

    # Second pass: walk every result page of every search and extract the listings.
    for url in urls:
        time.sleep(1)
        wb_data = requests.get(base + url, headers=headers)
        soup = bf(wb_data.content, 'lxml')
        pos = posqueue.popleft()
        rang = numqueue.popleft()
        print(pos, rang)
        for i in range(1, rang):
            wb_data = requests.get(base + url + str(i), headers=headers)  # n=<i> selects the page
            time.sleep(1)
            soup = bf(wb_data.content, 'lxml')
            sp = soup.find('div', attrs={'class': 'j-recommendJob'})
            tagsli = sp.find_all('div', attrs={'class': 'c-job-list'})
            for tag in tagsli:
                item = {'Company_name': '', 'City': '', 'Job': '', 'Required': '',
                        'NumOfStaff': '', 'CompanyType': '', 'Salary': ''}
                item['Company_name'] = tag.find('a', attrs={'class': 'job-company-name'}).get_text().strip()
                item['City'] = pos
                item['Job'] = tag.find('a', attrs={'class': 'job-name'}).get_text().strip()
                required = tag.find('div', attrs={'class': 'job-desc'}).find_all('span')
                item['Required'] = required[0].get_text().strip() + '|' + required[1].get_text().strip()
                company_information = tag.find('span', attrs={'class': 'job-desc'}).get_text().strip()
                if len(company_information.split('|')) > 1:
                    item['NumOfStaff'] = company_information.split('|')[1].strip()
                if len(company_information.split('|')[0].strip().split(',')) > 1:
                    item['CompanyType'] = company_information.split('|')[0].strip().split(',')[1].strip()
                else:
                    item['CompanyType'] = company_information.split('|')[0].strip()
                item['Salary'] = required[2].get_text().strip()
                items.append(item)
                print(item, i)

    print("Total number of records: " + str(len(items)))
    saveToCsv(items, item, 'zhiyoujidata.csv')  # the last item's keys serve as the CSV header


def saveToCsv(items, keyword_list, path):
    """
    Save the scraped items to a CSV file.
    :param items: the list of dicts to save
    :param keyword_list: the field names, i.e. the CSV header
    :param path: path and name of the output file
    :return:
    """
    try:
        # On the first run, write the header row.
        if not os.path.exists(path):
            with open(path, "w", newline='', encoding='utf-8') as csvfile:  # newline='' avoids blank lines
                writer = csv.DictWriter(csvfile, fieldnames=keyword_list)
                writer.writeheader()
        # Then append the data rows.
        with open(path, "a", newline='', encoding='utf-8') as csvfile:  # newline='' is required, otherwise blank rows appear
            writer = csv.DictWriter(csvfile, fieldnames=keyword_list)
            for i in range(len(items)):
                writer.writerow(items[i])
            print("^_^ write success")
    except Exception as e:
        print("write error==>", e)
        # Dump the data so it is not lost on failure.
        with open("error.txt", "w") as f:
            f.write(json.dumps(items) + ",\n")


if __name__ == '__main__':
    get()
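After a successful run the listings end up in zhiyoujidata.csv. As a small usage sketch (assuming the file was produced by the script above, so the header comes from the item dict's keys), the data can be read back with the standard csv module:

import csv

with open('zhiyoujidata.csv', newline='', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))

print(len(rows), 'listings scraped')
for row in rows[:3]:
    # Fields match the item dict: Company_name, City, Job, Required, NumOfStaff, CompanyType, Salary
    print(row['City'], row['Company_name'], row['Job'], row['Salary'])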
