需要将网站数据导出到 CSV 文件中,由于数据页数很多,采用 Python 抓取非常方便。
首先在浏览器中找到请求的接口,以及携带的参数,返回的结果样式,如图:
根据以上接口、参数和返回结果的信息来编写抓取脚本。
简单的Python抓取网站数据保存为csv代码如下:
import csv
import json
import ssl
import urllib.request
# Endpoint to scrape; "{}" is replaced with the page number on every request.
url = 'https://xxxxx?entName=&licenseNo=&extractionDateStart=&extractionDateEnd=&checkTimeStart=&checkTimeEnd=&page={}&pageSize=150'

# Do not verify TLS certificates.
# NOTE(review): this overrides a private hook of the ssl module, so
# verification is switched off for every HTTPS request made through urllib in
# this process -- confirm that this is acceptable for the target host.
ssl._create_default_https_context = ssl._create_unverified_context

# Headers (including the Cookie) sent with every request.
headers = {
    'Host':'xxxx.com',
    'Referer':'https://xxxx.com/',
    'Connection':'keep-alive',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Cookie':'_tb_token_=iNkDeJLdM3MgvKjhsfdW; bs_n_lang=zh_CN; cna=aaj1EViI7x0CATo9kTKvjzgS; ck2=072de851f1c02d5c7bac555f64c5c66d; c_token=c74594b486f8de731e2608cb9526a3f2; an=5YWo5qOJ5pe25Luj5a6Y5pa55peX6Iiw5bqXOnpmeA%3D%3D; lg=true; sg=\"=19\"; lvc=sAhojs49PcqHQQ%3D%3D; isg=BPT0Md7dE_ic5Ie3Oa85RxaMxbLK3UqJMMiN6o5VjH8C-ZRDtt7aRXb3fXGEAVAP',
}

# Attribute names read from every item of the response, in CSV column order.
FIELD_NAMES = (
    '返回结果属性名1',
    '返回结果属性名2',
    '返回结果属性名3',
    '返回结果属性名4',
    '返回结果属性名5',
    '返回结果属性名6',
    '返回结果属性名7',
    '返回结果属性名8',
)

# CSV header row: the sequence number column followed by one column per field.
header = ['列1', '列2', '列3', '列4', '列5', '列6', '列7', '列8', '列9']

# Pages requested are FIRST_PAGE <= page < STOP_PAGE.
# NOTE(review): the original comment says the site has 721 pages in total (a
# number that could also be obtained with one extra request), yet with these
# values only page 720 is fetched. Widen the range once it is confirmed
# whether the site numbers its pages from 0 or from 1.
FIRST_PAGE = 720
STOP_PAGE = 721

# Destination of the exported CSV file.
OUTPUT_PATH = 'D:\\xxx.csv'


def build_rows(jsonobj):
    """Convert one decoded response into a list of CSV rows.

    Args:
        jsonobj: Decoded JSON response; the items are read from
            ``jsonobj['data']['list']``.

    Returns:
        One list per item: a 1-based sequence number (it restarts for every
        response) followed by ``str()`` of each attribute in ``FIELD_NAMES``.

    Raises:
        KeyError: If the response or an item lacks an expected key.
    """
    return [
        [number] + [str(item[name]) for name in FIELD_NAMES]
        for number, item in enumerate(jsonobj['data']['list'], start=1)
    ]


def fetch_page(page):
    """Request a single page and return the decoded JSON response.

    The request URL and the raw response body are printed, as a progress and
    debugging aid.

    Args:
        page: Page number substituted into ``url``.

    Returns:
        The object produced by ``json.loads`` for the response body.
    """
    req_url = url.format(page)
    print(req_url)
    request = urllib.request.Request(url=req_url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        text = response.read().decode('utf8')
    print(text)
    return json.loads(text)


def main():
    """Fetch every configured page and export all rows to ``OUTPUT_PATH``."""
    rows = []
    for page in range(FIRST_PAGE, STOP_PAGE):
        rows.extend(build_rows(fetch_page(page)))

    # newline='' lets the csv module control the line endings itself.
    with open(OUTPUT_PATH, 'w', encoding='gb18030', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(header)
        f_csv.writerows(rows)


if __name__ == '__main__':
    main()
此处评论已关闭