ZIP Dangdang bestseller-ranking spider + NoSQL data analysis 259.62KB

m0_63596031

Resource file list:

当当网爬虫+nosql数据分析代码.zip contains 3 files:
  1. Data Analysis.py 5.91KB
  2. spider.py 6.26KB
  3. 当当网近4年畅销图书榜单数据.xlsx 255.62KB

Resource description:

Scrapes the Dangdang (dangdang.com) bestseller rankings and stores the records in a local NoSQL (MongoDB) database for multi-dimensional data analysis.
from lxml import etree
import requests
import time
import pandas as pd
from pymongo import MongoClient

# Session cookies copied from a browser visit to dangdang.com
cookies = {
    'ddscreen': '2',
    'dest_area': 'country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0',
    '__permanent_id': '20240423210658530124490989268736883',
    'MDD_channelId': '70000',
    'MDD_fromPlatform': '307',
    '__visit_id': '20240530154038979262380281306734049',
    '__out_refer': '',
    '__rpm': '...1717054859559%7C...1717054899777',
    '__trace_id': '20240530154142377181404279783243769',
}

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}

def SpiderData(url):
    # Download one ranking page and return its HTML
    response = requests.get(url, cookies=cookies, headers=headers, verify=False)
    return response.text

def ParseData(page_text, key):
    # Extract the eleven fields of every list item on the page
    tree = etree.HTML(page_text)
    lis = tree.xpath('.//ul[@class="bang_list clearfix bang_list_mode"]/li')
    (rank, name, comments, recommends, author, publish_date, publish_house,
     original_price, discount_price, discount, ebook_price) = [[] for _ in range(11)]
    for li in lis:
        rank.append(''.join(li.xpath('.//div[@class="list_num red" or @class="list_num "]/text()')).replace('.', ''))
        name.append(''.join(li.xpath('.//div[@class="name"]/a/text()')))
        comments.append(''.join(li.xpath('.//div[@class="star"]/a/text()')).split('条')[0])
        recommends.append(''.join(li.xpath('.//div[@class="star"]/span/text()')).split('推荐')[0])
        author.append(''.join(li.xpath('.//div[@class="publisher_info"][1]/a[1]/text()')))
        publish_date.append(''.join(li.xpath('.//div[@class="publisher_info"][2]/span/text()')))
        publish_house.append(''.join(li.xpath('.//div[@class="publisher_info"][2]/a/text()')))
        original_price.append(''.join(li.xpath('.//div[@class="price"]/p[1]/span[1]/text()')).replace("¥", ""))
        discount_price.append(''.join(li.xpath('.//span[@class="price_r"]/text()')).replace("¥", ""))
        discount.append(''.join(li.xpath('.//span[@class="price_s"]/text()')))
        ebook_price.append(''.join(li.xpath('./div[@class="price"]/p[@class="price_e"]/span[@class="price_n"]/text()')).replace("¥", ""))
    # Assemble the parsed columns into a DataFrame; the ranking-type key is
    # broadcast across all rows of the page
    dic = {
        '排行榜类型': key,
        '排序': rank,
        '书名': name,
        '评论数': comments,
        '推荐值': recommends,
        '作者': author,
        '出版日期': publish_date,
        '出版社': publish_house,
        '原价': original_price,
        '折扣价': discount_price,
        '折扣比例': discount,
        '电子书价格': ebook_price,
    }
    return pd.DataFrame(dic)

if __name__ == "__main__":
    # Empty DataFrame to accumulate every page of every yearly ranking
    columns = ['排行榜类型', '排序', '书名', '评论数', '推荐值', '作者',
               '出版日期', '出版社', '原价', '折扣价', '折扣比例', '电子书价格']
    df = pd.DataFrame(columns=columns)
    book_rank_type = {
        "2020年": "year-2020-0-1",
        "2021年": "year-2021-0-1",
        "2022年": "year-2022-0-1",
        "2023年": "year-2023-0-1",
    }
    # Loop over the four yearly rankings
    for key, value in book_rank_type.items():
        print(f'===== Start scraping the {key} ranking =====')
        for page in range(25):  # each yearly ranking has 25 pages
            print(f'***** Scraping page {page + 1} *****')
            url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-{value}-{page + 1}'
            time.sleep(1)  # throttle requests
            data = SpiderData(url)
            df1 = ParseData(data, key)
            k = len(df)
            df = pd.concat([df, df1], axis=0)
            df.drop_duplicates(subset=None, keep='first', inplace=True)
            print(f'***** Page {page + 1} done: {len(df) - k} new rows, {len(df)} rows so far *****')
        df = df.reset_index(drop=True)
        print(f'===== {key} ranking finished, {len(df)} rows in total =====')

    # # Store the scraped records in a local MongoDB instance (default port);
    # # the database and collection are created automatically on first write.
    # client = MongoClient('localhost', 27017)
    # collection = client['dangdang_4213']['dangdang_4213']
    # list_of_dicts = df.to_dict('records')
    # collection.insert_many(list_of_dicts)
    # print(f'{len(list_of_dicts)} records stored in MongoDB.')
    # client.close()

    df.to_excel('当当网近4年畅销图书榜单数据.xlsx', header=True, index=False)