博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
爬取拉勾网所有 Python 职位并保存到 Excel 表格(面向对象方式)
阅读量:5316 次
发布时间:2019-06-14

本文共 6851 字,大约阅读时间需要 22 分钟。

# 1. Rework the earlier examples, extracting data with bs4, regular
#    expressions and xpath.
# 2. Crawl every Python position listed on lagou.com.
from urllib import request, parse
import json
import random

# xlsxwriter is used to build the Excel workbook object.
import xlsxwriter


class python_position:
    """Crawl Lagou's Python job listings and store them in ``test1.xlsx``.

    Constructing an instance runs the whole job: it creates the workbook,
    requests result pages 1 through 30 of the position search endpoint and
    writes one spreadsheet row per returned record, then closes the workbook.

    Attributes:
        page: Page number currently being requested.
        number: Zero-based index of the next data row. Records are written
            to sheet row ``number + 1`` because row 0 holds the header.
        keys_list: Column order, taken from the keys of the first record
            written; ``None`` until a header has been written.
    """

    # Position search endpoint; the ``city`` parameter is the URL-encoded
    # form of "北京" (Beijing).
    BASE_URL = (
        "https://www.lagou.com/jobs/positionAjax.json"
        "?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0"
    )

    # Browser identities; one is picked at random for every request.
    # Duplicated entries are kept on purpose so the selection odds stay the
    # same as before.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    )

    def __init__(self, page, number):
        """Run the complete crawl.

        Args:
            page: Initial page number (overwritten as soon as the page loop
                starts).
            number: Index of the first data row; pass 0 to start directly
                below the header.
        """
        self.page = page
        self.number = number
        self.keys_list = None
        # Create the Excel objects, then crawl every page.
        self.create_book()
        self.page_chuli()
        # Last page requested and the final row index.
        print(self.page, self.number)

    def create_book(self):
        """Create the ``test1.xlsx`` workbook and its ``test1`` worksheet."""
        self.workbook_attr = xlsxwriter.Workbook('test1.xlsx')
        # Sheet object that all cells are written to.
        self.worksheet = self.workbook_attr.add_worksheet('test1')

    def page_chuli(self):
        """Request pages 1 to 30 in order and then close the workbook.

        ``self.number`` is advanced only by ``workbook`` (once per record);
        it is not bumped again per page, so rows stay contiguous and an
        empty first page cannot skip the header.

        The workbook is closed in ``finally`` so that rows collected before
        a failed request are still saved; the error itself propagates.
        """
        try:
            for page in range(1, 31):
                self.page = page
                self.user_agent()
        finally:
            self.workbook_attr.close()

    def user_agent(self):
        """Pick a random User-Agent for this request, then fetch the page."""
        self.user_agent_str = random.choice(self.USER_AGENTS)
        self.lagou()

    def lagou(self):
        """POST the search form for ``self.page`` and write its records.

        Stores the decoded record list in ``self.result_list`` and hands it
        to ``workbook``.

        Raises:
            urllib.error.URLError: If the HTTP request fails.
            KeyError: If the JSON reply has no
                ``content -> positionResult -> result`` entry.
        """
        # The form's ``first`` field is 'true' only for the first page.
        first = 'true' if self.page == 1 else 'false'
        print('page:', self.page)

        # URL-encode the form and convert it to the bytes that are sent;
        # Content-Length below is derived from these same bytes.
        body = parse.urlencode({
            'first': first,
            'pn': self.page,
            'kd': 'python',
        }).encode('utf-8')

        # Author's note: compare the headers the browser sends for different
        # pages carefully; the small differences decide whether the crawl
        # succeeds. Content-Length and User-Agent matter most here.
        #
        # NOTE(review): the Cookie below is a captured logged-in browser
        # session (it carries ``login=true`` and a ``gate_login_token``).
        # Treat it as a credential; it should come from configuration
        # rather than source code.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # Accept-Encoding (gzip, deflate) is not sent; presumably so the
            # reply is uncompressed, since it is decoded as UTF-8 directly.
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Content-Length': len(body),
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': (
                'user_trace_token=20180310205250-ccfd21f6-5b57-4e04-b90c-5e547e18d391; '
                'LGUID=20180310205255-f3afa6e4-2461-11e8-a8b5-525400f775ce; '
                'hideSliderBanner20180305WithTopBannerC=1; '
                'X_HTTP_TOKEN=673c8ae0b29d830c65e9812a6aeeb211; '
                'ab_test_random_num=0; '
                'JSESSIONID=ABAAABAAADEAAFI0BD8484557BF60A48BF2BDD6AA4C5D33; '
                '_putrc=318C0D90043747B6123F89F2B170EADC; '
                'login=true; '
                'unick=%E5%BC%A0%E6%B3%A2; '
                'showExpriedIndex=1; '
                'showExpriedCompanyHome=1; '
                'showExpriedMyPublish=1; '
                'hasDeliver=0; '
                'gate_login_token=d46c3e3008cb0364e7b47d9d261956a39273c72d679a1b0eb644e03620c100fa; '
                'TG-TRACK-CODE=index_navigation; '
                '_gid=GA1.2.1883607132.1520686376; '
                '_ga=GA1.2.2068283511.1520686375; '
                'LGSID=20180310215122-1e408aca-246a-11e8-a8ed-525400f775ce; '
                'LGRID=20180310233852-22b0d3ee-2479-11e8-a921-525400f775ce; '
                'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520686378,1520689884; '
                'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1520696337; '
                'SEARCH_ID=458b8d44186948ceb472c3d662f08528; '
                'index_location_city=%E5%8C%97%E4%BA%AC'
            ),
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
            "User-Agent": self.user_agent_str,
            'X-Anit-Forge-Code': 0,
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest',
        }

        req = request.Request(url=self.BASE_URL, data=body, headers=headers)
        # The context manager closes the connection once the body is read.
        with request.urlopen(req) as response:
            html = response.read().decode('utf-8')

        # Parse the JSON reply into a dict and pull out the record list.
        json_data = json.loads(html)
        self.result_list = json_data['content']['positionResult']['result']
        print(self.result_list)
        self.workbook()

    def workbook(self):
        """Write every record of the current page, one sheet row each."""
        for result in self.result_list:
            self.worksheet1(result)
            self.number += 1
            print(self.number)
            print('~~~~~~~~~~~~~~~~~~~~~haha~~~~~~~~~~~~~~~~~~~~~~~')

    def worksheet1(self, result):
        """Write one record to sheet row ``self.number + 1``.

        The header row (row 0) is written when ``self.number`` is 0 or when
        no header has been written yet, so a non-zero starting ``number``
        still gets column titles. Column order is fixed by the first record;
        a later record that lacks one of those keys gets an empty cell.

        Args:
            result: Mapping of field name to value for a single position.
        """
        keys = list(result.keys())
        if self.number == 0 or getattr(self, 'keys_list', None) is None:
            self.keys_list = keys
            for col, key in enumerate(keys):
                # worksheet.write(row, col, value)
                print(key, type(key), col)
                self.worksheet.write(0, col, str(key))
                print('~~~~~~~~~~~~~~~~~~~~0~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

        # Follow the header's column order, not this record's own key order.
        values = [result.get(k, '') for k in self.keys_list]
        for col, value in enumerate(values):
            # worksheet.write(row, col, value)
            print(str(value), col, len(values))
            self.worksheet.write(self.number + 1, col, str(value))
            print('------------------------1-------------------------------')


if __name__ == '__main__':
    lagou_python = python_position(1, 0)

 

转载于:https://www.cnblogs.com/zhangboblogs/p/8546423.html

你可能感兴趣的文章
RxJS & Angular
查看>>
面向对象(多异常的声明与处理)
查看>>
MTK笔记
查看>>
ERROR: duplicate key value violates unique constraint "xxx"
查看>>
激活office 365 的启动文件
查看>>
无法根据中文查找
查看>>
[简讯]phpMyAdmin项目已迁移至GitHub
查看>>
转载 python多重继承C3算法
查看>>
【题解】 bzoj1597: [Usaco2008 Mar]土地购买 (动态规划+斜率优化)
查看>>
css文本溢出显示省略号
查看>>
git安装和简单配置
查看>>
面向对象:反射,双下方法
查看>>
鼠标悬停提示文本消息最简单的做法
查看>>
Java面向对象重要关键字
查看>>
课后作业-阅读任务-阅读提问-2
查看>>
面向对象设计中private,public,protected的访问控制原则及静态代码块的初始化顺序...
查看>>
fat32转ntfs ,Win7系统提示对于目标文件系统文件过大解决教程
查看>>
Awesome Adb——一份超全超详细的 ADB 用法大全
查看>>
shell cat 合并文件,合并数据库sql文件
查看>>
Android 将drawable下的图片转换成bitmap、Drawable
查看>>