当前位置: 首页 > news >正文

爬虫练习_01


前言

基础爬虫小练习01

一、requests板块使用

demo_01

import requests
from lxml import etreeurl = "https://movie.douban.com/top250"
headers = {"authority": "movie.douban.com","method": "GET","path": "/top250","scheme": "https","accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7","accept-encoding": "gzip, deflate, br, zstd","accept-language": "zh-CN,zh;q=0.9","cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9","priority": "u=0, i","sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "document","sec-fetch-mode": "navigate","sec-fetch-site": "none","sec-fetch-user": "?1","upgrade-insecure-requests": "1","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}resp = requests.get(url=url, headers=headers)page = etree.HTML(resp.text)
rts = page.xpath("//ol[@class='grid_view']/li/div[@class='item']")for rt in rts:title = rt.xpath(".//span[@class='title']/text()")[0]score = rt.xpath(".//span[@class='rating_num']/text()")[0]print(title, score)

demo_02

# https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
import requests
import lxmlurl = "https://movie.douban.com/j/chart/top_list"
headers = {"authority": "movie.douban.com","method": "GET","path": "/j/chart/top_list?type=13&interval_id=100%3A90&action=&start=0&limit=20","scheme": "https","accept": "*/*","accept-encoding": "gzip, deflate, br, zstd","accept-language": "zh-CN,zh;q=0.9","cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01","priority": "u=1, i","referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=","sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "empty","sec-fetch-mode": "cors","sec-fetch-site": "same-origin","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36","x-requested-with": "XMLHttpRequest","Enhanced": "'Never pause here'","The": "network throttling presets are updated with fast and slow 4G.","New": "scroll snap event listeners","Updated": "network throttling presets"
}
my_params = {"type": "13","interval_id": "100:90","action": "","start": "0","limit": "20"
}page = requests.get(url=url, params=my_params, headers=headers)
# print(page.text)
# print(page.request.url)
print(page.json())

demo_03

# https://www.iciba.com/translateimport requestsurl = "https://ifanyi.iciba.com/index.php"
my_headers = {"authority": "ifanyi.iciba.com","method": "POST","path": "/index.php?c=trans&m=fy&client=6&auth_user=key_web_new_fanyi&sign=SioZA5jWOFlUevETjhAhp9RriqVIAJSQ%2BxmfU0q7dIE%3D","scheme": "https","accept": "application/json, text/plain, */*","accept-encoding": "gzip, deflate, br, zstd","accept-language": "zh-CN,zh;q=0.9","content-length": "34","content-type": "application/x-www-form-urlencoded","origin": "https://www.iciba.com","priority": "u=1, i","referer": "https://www.iciba.com/","sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "empty","sec-fetch-mode": "cors","sec-fetch-site": "same-site","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36","Enhanced": "'Never pause here'","The": "network throttling presets are updated with fast and slow 4G.","New": "scroll snap event listeners","Updated": "network throttling presets"
}my_param = {"c": "trans","m": "fy","client": "6","auth_user": "key_web_new_fanyi","sign": "SioZA5jWOFlUevETjhAhp9RriqVIAJSQ+xmfU0q7dIE="
}form_data = {"from": "auto","to": "auto","q": "i love you"
}resp = requests.post(url, params=my_param, data=form_data, headers=my_headers)
print(resp.json())

demo_04

# 下载一张图片
import requestsurl = "https://img.yituyu.com/gallery/8234/00_llcUYWmo.jpg"headers = {"accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8","accept-encoding": "gzip, deflate, br, zstd","accept-language": "zh-CN,zh;q=0.9","connection": "keep-alive","cookie": "yituyu_first_time=1723381635000; yituyu_os=Windows%20NT%2010.0%3B%20Win64%3B%20x64; Hm_lvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381639; HMACCOUNT=211F43F3C6950EDF; Hm_lpvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381807","host": "img.yituyu.com","if-modified-since": "Sat, 15 Apr 2023 14:17:40 GMT","if-none-match": "\"900C5188548CC7C8CBAD301A215FA0D0\"","referer": "https://www.yituyu.com/","sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "image","sec-fetch-mode": "no-cors","sec-fetch-site": "same-site","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}resp = requests.get(url, headers=headers)
print(resp.content)with open("tu.jpg", mode="wb") as f:f.write(resp.content)

demo_05

# https://movie.douban.com/review/best/
import requests
from lxml import etree
import reurl = "https://movie.douban.com/review/best/"
my_headers = {"authority": "movie.douban.com","method": "GET","path": "/review/best/","scheme": "https","accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7","accept-encoding": "gzip, deflate, br, zstd","accept-language": "zh-CN,zh;q=0.9","cache-control": "max-age=0","cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1388041158.1723366816.1723366816.1723382928.2; __utmb=30149280.0.10.1723382928; __utma=223695111.165463845.1723366816.1723366816.1723382928.2; __utmb=223695111.0.10.1723382928","priority": "u=0, i","referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=","sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"","sec-ch-ua-mobile": "?0","sec-ch-ua-platform": "\"Windows\"","sec-fetch-dest": "document","sec-fetch-mode": "navigate","sec-fetch-site": "same-origin","sec-fetch-user": "?1","upgrade-insecure-requests": "1","user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}resp_1 = requests.get(url, headers=my_headers)# print(resp_1.text)
page = etree.HTML(resp_1.text)
rts = page.xpath("//div[@data-cid]")
print(len(rts))for rt in rts:title = rt.xpath(".//div[@class='main-bd']/h2/a/text()")[0]rid = rt.xpath(".//div[@class='review-short']/@data-rid")[0]# print(title, rid)full_url = f"https://movie.douban.com/j/review/{rid}/full"full_resp = requests.get(full_url, headers=my_headers)body = full_resp.json()["body"]body_page = etree.HTML(body)content = body_page.xpath("//div[@class='review-content clearfix']//text()")content = "".join(content)content = re.sub(r"\s", "", content)print(title)print(content)print("===================================")

总结

request 板块练习01

相关文章:

爬虫练习_01

前言 基础爬虫小练习01 一、requests板块使用 demo_01 import requests from lxml import etreeurl "https://movie.douban.com/top250" headers {"authority": "movie.douban.com","method": "GET","path"…...

Datawhale X 魔搭 AI夏令营第四期 魔搭-AIGC方向 task02笔记

从零入门AI生图原理&实践 是 Datawhale 2024 年 AI 夏令营第四期的学习活动(“AIGC”方向),基于魔搭社区“可图Kolors-LoRA风格故事挑战赛”开展的实践学习。 Datawhale官方的Task2链接:Task02 往期Task1链接:Ta…...

多模态大语言模型的免训练视觉提示学习 ControlMLLM

ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models github paper 在本研究中,提出了一种无需进行训练的方法,通过可学习的潜变量优化将视觉提示注入到多模态大型语言模型(MLLMs)中。 在…...

Oracle|DM 常用|不常用 SQL大口袋

目录 一、前言 二、SQL写法 1、sql获取某一条数据中的前一条和后一条 2、实现like多个值的查询(Oracle和dm支持,MySQL未试过) 3、start with connect by prior 使用方法 4、用hextoraw解决select、update、delete语句执行慢 5、ORA-00…...

嵌入式软件--模电基础 DAY 1

C语言的学习告一段落了,要多多注意复习回顾,温故而知新,学习的过程就是与遗忘作斗争。接下来就是嵌入式学习中硬件电路方面的知识了。 一、电学基础 1.电流 电流(Current)是电荷在单位时间内通过导体横截面的流动量…...

【Nacos无压力源码领读】(二) 集成 LoadBalancer 与 OpenFeign

上一篇文章中, 详细介绍了 Nacos 注册中心的原理, 相信看完后, 大家应该完全掌握了 Nacos 客户端是如何自动进行服务注册的, 以及 Nacos 客户端是如何订阅服务实例信息的, 以及 Nacos 服务器是如何处理客户端的注册和订阅请求的; 本文承上启下, 在订阅服务实例的基础上, 介绍如…...

《投资的原理》阅读笔记二——价值投资真是王者吗?

《投资的原理》的第二章是《史记货殖列传里的八大投资金句》,作者在这一章里宣扬的主要观点是价值投资才是稳妥的投资之路。但我觉得作者讲述的很多例子,包括经典的“两个金条放在一起,你告诉我那根是高尚的”,更多的应该体现在“…...

SSH、FTP、SFTP相关协议详解

一、SSH 1、定义 SSH(Secure Shell)是一种网络协议,用于加密方式远程登录到另一台计算机上,并执行命令或程序。SSH由IETF的网络小组(Network Working Group)所制定,是建立在应用层基础上的安全…...

C语言进阶——一文带你深度了解“C语言关键字”(中篇6)

本篇文章记录我学习C语言进阶知识——C语言关键字,旨在记录分享,希望我的分享能带给你不一样的收获! 目录 一、return关键字 二、const 关键字也许该被替换为 readolny (一)、 const 修饰的只读变量 (二…...

自建极简Ethercat主站-第8章 FOE基础功能实现

文章目录 第8章 FOE8.1 FOE简介8.2 FOE 数据结构8.2.1 FOE帧格式8.2.2 FOE请求8.3 数据传输流程8.3.1 读流程8.3.2 写流程8.3.3 忙操作8.3.4 代码示例第8章 FOE 源码地址 8.1 FOE简介 ​ FOE(File Access over Ethercat),用于节点之间的文件传输。协议类似于TFTP协议,感觉…...

SQL Zoo 8.Using Null

以下数据均来自SQL Zoo 1.List the teachers who have NULL for their department.(列出所属部门为NULL的教师) select name from teacher where dept is null 2.Note the INNER JOIN misses the teachers with no department and the departments wit…...

LeetCode274. H 指数

题目链接: 274. H 指数 - 力扣(LeetCode) 思路分析:这个题目可以使用哈希表来以空间换时间,我们设置一个数组v来统计每一个对应的影响因子的文章出现的数量,遍历一遍后,v[i]表示影响因子为i的…...

概述:Dubbo、Nacos、 Zookeeper 等分布式服务协调与治理等技术

目录 1. Dubbo 2. Nacos 3. Zookeeper Dubbo、Nacos、Zookeeper 是分布式服务协调与治理领域中的关键技术,它们在微服务架构和分布式系统中扮演着重要角色。以下是对这些技术的详细介绍: 1. Dubbo 概述: Dubbo 是一个高性能、轻量级的开…...

【LINUX】小工具降耦合,全内核函数插入宏摸索测试中。。

这阵子把这个小工具对外的耦合度降了下, include/linux/printk_self.h r77683962/linux-6.9.0 - Gitee.comhttps://gitee.com/r77683962/linux-6.9.0/blob/master/include/linux/printk_self.h 这个用于初始化打印日志的级别和打印次数: void Param…...

24/8/12算法笔记 复习_线性回归

import numpy as np#导入包 X np.array([[1,1],[2,1]])#构造矩阵 y np.array([14,10])np.linalg.solve(X,y) #linalg是线性代数,用于求解线性方程AX b,solve计算线性代数回归问题X.T#转置 a X.T.dot(X)#矩阵乘法B np.linalg.inv(a)#求逆矩阵from sklearn.linea…...

Linux系统驱动(十四)输入子系统

文章目录 一、输入子系统(一)输入子系统框架结构(二)输入子系统的API 二、实现两个按键的驱动(一)实现思路(二)代码实现 一、输入子系统 在linux系统中使用输入子系统驱动上报鼠标&…...

力扣(2024.08.12)

1. 98:验证二叉搜索树 # Definition for a binary tree node. # class TreeNode: # def __init__(self, val0, leftNone, rightNone): # self.val val # self.left left # self.right right class Solution:def isValidBST(self, r…...

最新版的AutoGPT,我搭建好了

最近AutoGPT不是更新了嘛 安装 我按照官方的教程 在本地搭建好了 改动 可见的改动,主要是把原来的纯命令行改成前后端的形式 看下前端界面 界面比较简单,主要分3个大块 监控 第一个是监控 主要是看你在 build 里构建的Agents的运行情况 build 第一个是Ag…...

[SWPUCTF 2021 新生赛]PseudoProtocols(构造伪协议)

打开题目所给的环境我们可以看到这样一句话: 这里我先尝试访问/hint.php ,但是发现什么都没有发生, F12查看源代码也并没有发现什么,到这里来看的话似乎没有思路了,但是这个题的题目已经给了我们很明显的提示&#xff…...

基于STM32开发的智能语音助手系统

目录 引言环境准备工作 硬件准备软件安装与配置系统设计 系统架构硬件连接代码实现 初始化代码控制代码应用场景 智能家居控制个人语音助理常见问题及解决方案 常见问题解决方案结论 1. 引言 随着人工智能技术的发展,智能语音助手已经逐渐进入了人们的日常生活。…...

Chapter03-Authentication vulnerabilities

文章目录 1. 身份验证简介1.1 What is authentication1.2 difference between authentication and authorization1.3 身份验证机制失效的原因1.4 身份验证机制失效的影响 2. 基于登录功能的漏洞2.1 密码爆破2.2 用户名枚举2.3 有缺陷的暴力破解防护2.3.1 如果用户登录尝试失败次…...

装饰模式(Decorator Pattern)重构java邮件发奖系统实战

前言 现在我们有个如下的需求,设计一个邮件发奖的小系统, 需求 1.数据验证 → 2. 敏感信息加密 → 3. 日志记录 → 4. 实际发送邮件 装饰器模式(Decorator Pattern)允许向一个现有的对象添加新的功能,同时又不改变其…...

YSYX学习记录(八)

C语言&#xff0c;练习0&#xff1a; 先创建一个文件夹&#xff0c;我用的是物理机&#xff1a; 安装build-essential 练习1&#xff1a; 我注释掉了 #include <stdio.h> 出现下面错误 在你的文本编辑器中打开ex1文件&#xff0c;随机修改或删除一部分&#xff0c;之后…...

将对透视变换后的图像使用Otsu进行阈值化,来分离黑色和白色像素。这句话中的Otsu是什么意思?

Otsu 是一种自动阈值化方法&#xff0c;用于将图像分割为前景和背景。它通过最小化图像的类内方差或等价地最大化类间方差来选择最佳阈值。这种方法特别适用于图像的二值化处理&#xff0c;能够自动确定一个阈值&#xff0c;将图像中的像素分为黑色和白色两类。 Otsu 方法的原…...

TRS收益互换:跨境资本流动的金融创新工具与系统化解决方案

一、TRS收益互换的本质与业务逻辑 &#xff08;一&#xff09;概念解析 TRS&#xff08;Total Return Swap&#xff09;收益互换是一种金融衍生工具&#xff0c;指交易双方约定在未来一定期限内&#xff0c;基于特定资产或指数的表现进行现金流交换的协议。其核心特征包括&am…...

【git】把本地更改提交远程新分支feature_g

创建并切换新分支 git checkout -b feature_g 添加并提交更改 git add . git commit -m “实现图片上传功能” 推送到远程 git push -u origin feature_g...

QT: `long long` 类型转换为 `QString` 2025.6.5

在 Qt 中&#xff0c;将 long long 类型转换为 QString 可以通过以下两种常用方法实现&#xff1a; 方法 1&#xff1a;使用 QString::number() 直接调用 QString 的静态方法 number()&#xff0c;将数值转换为字符串&#xff1a; long long value 1234567890123456789LL; …...

智能仓储的未来:自动化、AI与数据分析如何重塑物流中心

当仓库学会“思考”&#xff0c;物流的终极形态正在诞生 想象这样的场景&#xff1a; 凌晨3点&#xff0c;某物流中心灯火通明却空无一人。AGV机器人集群根据实时订单动态规划路径&#xff1b;AI视觉系统在0.1秒内扫描包裹信息&#xff1b;数字孪生平台正模拟次日峰值流量压力…...

均衡后的SNRSINR

本文主要摘自参考文献中的前两篇&#xff0c;相关文献中经常会出现MIMO检测后的SINR不过一直没有找到相关数学推到过程&#xff0c;其中文献[1]中给出了相关原理在此仅做记录。 1. 系统模型 复信道模型 n t n_t nt​ 根发送天线&#xff0c; n r n_r nr​ 根接收天线的 MIMO 系…...

算法:模拟

1.替换所有的问号 1576. 替换所有的问号 - 力扣&#xff08;LeetCode&#xff09; ​遍历字符串​&#xff1a;通过外层循环逐一检查每个字符。​遇到 ? 时处理​&#xff1a; 内层循环遍历小写字母&#xff08;a 到 z&#xff09;。对每个字母检查是否满足&#xff1a; ​与…...