爬虫练习_01
前言
基础爬虫小练习01
一、requests板块使用
demo_01
import requests
from lxml import etree

# demo_01: fetch the first page of Douban's Top 250 list and print every
# movie title together with its rating.
url = "https://movie.douban.com/top250"

# Browser-like headers captured from DevTools.
# Deliberately NOT sent:
#   * authority / method / path / scheme -- these are HTTP/2 pseudo-headers
#     shown by DevTools, not regular header fields.
#   * accept-encoding -- advertising "br, zstd" lets the server answer with an
#     encoding that requests can only decode when optional packages are
#     installed; requests' own default only lists what it can decode.
# NOTE(review): the cookie was captured from a browser session and will
# presumably go stale -- refresh it if the site starts rejecting requests.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": (
        "bid=cqDgOHLreU0; "
        "_pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; "
        "_pk_ses.100001.4cf6=1; "
        "__utma=30149280.1388041158.1723366816.1723366816.1723366816.1; "
        "__utmb=30149280.0.10.1723366816; "
        "__utmc=30149280; "
        "__utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "__utma=223695111.165463845.1723366816.1723366816.1723366816.1; "
        "__utmb=223695111.0.10.1723366816; "
        "__utmc=223695111; "
        "__utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "ap_v=0,6.0; "
        "__yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9"
    ),
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    ),
}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() turns an HTTP error page into an explicit exception
# instead of silently parsing it and printing nothing.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()

page = etree.HTML(resp.text)

# One <div class="item"> per movie inside the ranked <ol class="grid_view">.
rts = page.xpath("//ol[@class='grid_view']/li/div[@class='item']")
for rt in rts:
    # The first span.title is taken as the title; any further span.title
    # nodes in the same item are ignored.
    title = rt.xpath(".//span[@class='title']/text()")[0]
    score = rt.xpath(".//span[@class='rating_num']/text()")[0]
    print(title, score)
demo_02
# demo_02: query the JSON endpoint behind Douban's genre ranking page
# https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
import requests
import lxml  # not used by this demo; kept from the original listing

url = "https://movie.douban.com/j/chart/top_list"

# Browser-like headers captured from DevTools.
# Deliberately NOT sent:
#   * authority / method / path / scheme -- HTTP/2 pseudo-headers shown by
#     DevTools, not regular header fields.
#   * accept-encoding -- advertising "br, zstd" lets the server answer with an
#     encoding that requests can only decode when optional packages are
#     installed, which would break page.json().
#   * "Enhanced" / "The" / "New" / "Updated" -- not HTTP headers; they look
#     like stray text copied from a DevTools panel along with the headers.
# NOTE(review): the cookie was captured from a browser session and will
# presumably go stale -- refresh it if the site starts rejecting requests.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": (
        "bid=cqDgOHLreU0; "
        "_pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; "
        "_pk_ses.100001.4cf6=1; "
        "__utma=30149280.1388041158.1723366816.1723366816.1723366816.1; "
        "__utmb=30149280.0.10.1723366816; "
        "__utmc=30149280; "
        "__utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "__utma=223695111.165463845.1723366816.1723366816.1723366816.1; "
        "__utmb=223695111.0.10.1723366816; "
        "__utmc=223695111; "
        "__utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "ap_v=0,6.0; "
        "__yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; "
        'll="108296"; '
        "_vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01"
    ),
    "priority": "u=1, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    ),
    "x-requested-with": "XMLHttpRequest",
}

# Query-string parameters; requests URL-encodes them (e.g. the ":" in
# interval_id), so they are written here in plain form.
my_params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

# A timeout keeps the script from hanging on a stalled connection;
# raise_for_status() surfaces an HTTP error before .json() is attempted.
page = requests.get(url=url, params=my_params, headers=headers, timeout=10)
page.raise_for_status()

# Debugging aids: page.text is the raw payload, page.request.url is the
# final URL including the encoded query string.
print(page.json())
demo_03
# demo_03: POST a phrase to the translation endpoint used by
# https://www.iciba.com/translate and print the JSON reply.
import requests

url = "https://ifanyi.iciba.com/index.php"

# Browser-like headers captured from DevTools.
# Deliberately NOT sent:
#   * authority / method / path / scheme -- HTTP/2 pseudo-headers shown by
#     DevTools, not regular header fields.
#   * accept-encoding -- advertising "br, zstd" lets the server answer with an
#     encoding that requests can only decode when optional packages are
#     installed, which would break resp.json().
#   * content-length -- the captured value (34) is the browser's body size;
#     requests computes the correct length for the body it actually sends.
#   * "Enhanced" / "The" / "New" / "Updated" -- not HTTP headers; they look
#     like stray text copied from a DevTools panel along with the headers.
my_headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
    "origin": "https://www.iciba.com",
    "priority": "u=1, i",
    "referer": "https://www.iciba.com/",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    ),
}

# Query-string parameters (sent in the URL, not in the body).
# NOTE(review): "sign" was captured together with this exact request; it
# presumably depends on the text being translated -- confirm before changing
# form_data["q"].
my_param = {
    "c": "trans",
    "m": "fy",
    "client": "6",
    "auth_user": "key_web_new_fanyi",
    "sign": "SioZA5jWOFlUevETjhAhp9RriqVIAJSQ+xmfU0q7dIE=",
}

# Form-encoded request body.
form_data = {
    "from": "auto",
    "to": "auto",
    "q": "i love you",
}

# A timeout keeps the script from hanging on a stalled connection;
# raise_for_status() surfaces an HTTP error before .json() is attempted.
resp = requests.post(
    url, params=my_param, data=form_data, headers=my_headers, timeout=10
)
resp.raise_for_status()
print(resp.json())
demo_04
# demo_04: download a single image and save it as tu.jpg in the current
# working directory.
import requests

url = "https://img.yituyu.com/gallery/8234/00_llcUYWmo.jpg"

# Browser-like headers captured from DevTools.
# Deliberately NOT sent:
#   * if-modified-since / if-none-match -- these make the request
#     conditional; a server honouring them replies "304 Not Modified" with an
#     empty body, and the script would then write an empty tu.jpg.
#   * accept-encoding -- advertising "br, zstd" lets the server answer with an
#     encoding that requests can only decode when optional packages are
#     installed, which would corrupt the saved bytes.
headers = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-language": "zh-CN,zh;q=0.9",
    "connection": "keep-alive",
    "cookie": (
        "yituyu_first_time=1723381635000; "
        "yituyu_os=Windows%20NT%2010.0%3B%20Win64%3B%20x64; "
        "Hm_lvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381639; "
        "HMACCOUNT=211F43F3C6950EDF; "
        "Hm_lpvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381807"
    ),
    "host": "img.yituyu.com",
    "referer": "https://www.yituyu.com/",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-site",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    ),
}

# A timeout keeps the script from hanging on a stalled connection;
# raise_for_status() prevents an HTTP error page from being saved as the image.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()

# Debug output: the raw image bytes.
print(resp.content)

# Binary mode: the payload is written exactly as received.
with open("tu.jpg", mode="wb") as f:
    f.write(resp.content)
demo_05
# demo_05: list the reviews on https://movie.douban.com/review/best/ and, for
# each one, fetch the full text from the JSON endpoint and print it with all
# whitespace removed.
import requests
from lxml import etree
import re

url = "https://movie.douban.com/review/best/"

# Browser-like headers captured from DevTools; reused for the listing page
# and for every per-review request.
# Deliberately NOT sent:
#   * authority / method / path / scheme -- HTTP/2 pseudo-headers shown by
#     DevTools, not regular header fields.
#   * accept-encoding -- advertising "br, zstd" lets the server answer with an
#     encoding that requests can only decode when optional packages are
#     installed, which would garble .text and break .json().
# NOTE(review): the cookie was captured from a browser session and will
# presumably go stale -- refresh it if the site starts rejecting requests.
my_headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "cookie": (
        "bid=cqDgOHLreU0; "
        "_pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; "
        "__utmc=30149280; "
        "__utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "__utmc=223695111; "
        "__utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        "__yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; "
        'll="108296"; '
        "_vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01; "
        "_pk_ses.100001.4cf6=1; "
        "ap_v=0,6.0; "
        "__utma=30149280.1388041158.1723366816.1723366816.1723382928.2; "
        "__utmb=30149280.0.10.1723382928; "
        "__utma=223695111.165463845.1723366816.1723366816.1723382928.2; "
        "__utmb=223695111.0.10.1723382928"
    ),
    "priority": "u=0, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    ),
}

# A timeout keeps the script from hanging on a stalled connection;
# raise_for_status() surfaces an HTTP error instead of parsing an error page.
resp_1 = requests.get(url, headers=my_headers, timeout=10)
resp_1.raise_for_status()

page = etree.HTML(resp_1.text)

# Every review card on the listing page carries a data-cid attribute.
rts = page.xpath("//div[@data-cid]")
print(len(rts))

for rt in rts:
    titles = rt.xpath(".//div[@class='main-bd']/h2/a/text()")
    rids = rt.xpath(".//div[@class='review-short']/@data-rid")
    if not titles or not rids:
        # A matched node without a title or review id cannot be fetched;
        # skip it rather than abort the whole run with an IndexError.
        continue
    title = titles[0]
    rid = rids[0]

    # The listing only shows a teaser; the full review is served as JSON
    # whose "body" field is an HTML fragment.
    full_url = f"https://movie.douban.com/j/review/{rid}/full"
    full_resp = requests.get(full_url, headers=my_headers, timeout=10)
    full_resp.raise_for_status()
    body = full_resp.json()["body"]

    body_page = etree.HTML(body)
    content = body_page.xpath("//div[@class='review-content clearfix']//text()")
    content = "".join(content)
    # Strip every whitespace character so the review prints as one solid line.
    content = re.sub(r"\s", "", content)

    print(title)
    print(content)
    print("===================================")
总结
requests 板块练习01
相关文章:
爬虫练习_01
前言 基础爬虫小练习01 一、requests板块使用 demo_01 import requests from lxml import etreeurl "https://movie.douban.com/top250" headers {"authority": "movie.douban.com","method": "GET","path"…...
Datawhale X 魔搭 AI夏令营第四期 魔搭-AIGC方向 task02笔记
从零入门AI生图原理&实践 是 Datawhale 2024 年 AI 夏令营第四期的学习活动(“AIGC”方向),基于魔搭社区“可图Kolors-LoRA风格故事挑战赛”开展的实践学习。 Datawhale官方的Task2链接:Task02 往期Task1链接:Ta…...
多模态大语言模型的免训练视觉提示学习 ControlMLLM
ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models github paper 在本研究中,提出了一种无需进行训练的方法,通过可学习的潜变量优化将视觉提示注入到多模态大型语言模型(MLLMs)中。 在…...
Oracle|DM 常用|不常用 SQL大口袋
目录 一、前言 二、SQL写法 1、sql获取某一条数据中的前一条和后一条 2、实现like多个值的查询(Oracle和dm支持,MySQL未试过) 3、start with connect by prior 使用方法 4、用hextoraw解决select、update、delete语句执行慢 5、ORA-00…...
嵌入式软件--模电基础 DAY 1
C语言的学习告一段落了,要多多注意复习回顾,温故而知新,学习的过程就是与遗忘作斗争。接下来就是嵌入式学习中硬件电路方面的知识了。 一、电学基础 1.电流 电流(Current)是电荷在单位时间内通过导体横截面的流动量…...
【Nacos无压力源码领读】(二) 集成 LoadBalancer 与 OpenFeign
上一篇文章中, 详细介绍了 Nacos 注册中心的原理, 相信看完后, 大家应该完全掌握了 Nacos 客户端是如何自动进行服务注册的, 以及 Nacos 客户端是如何订阅服务实例信息的, 以及 Nacos 服务器是如何处理客户端的注册和订阅请求的; 本文承上启下, 在订阅服务实例的基础上, 介绍如…...
《投资的原理》阅读笔记二——价值投资真是王者吗?
《投资的原理》的第二章是《史记货殖列传里的八大投资金句》,作者在这一章里宣扬的主要观点是价值投资才是稳妥的投资之路。但我觉得作者讲述的很多例子,包括经典的“两个金条放在一起,你告诉我那根是高尚的”,更多的应该体现在“…...
SSH、FTP、SFTP相关协议详解
一、SSH 1、定义 SSH(Secure Shell)是一种网络协议,用于加密方式远程登录到另一台计算机上,并执行命令或程序。SSH由IETF的网络小组(Network Working Group)所制定,是建立在应用层基础上的安全…...
C语言进阶——一文带你深度了解“C语言关键字”(中篇6)
本篇文章记录我学习C语言进阶知识——C语言关键字,旨在记录分享,希望我的分享能带给你不一样的收获! 目录 一、return关键字 二、const 关键字也许该被替换为 readonly (一)、 const 修饰的只读变量 (二…...
自建极简Ethercat主站-第8章 FOE基础功能实现
文章目录 第8章 FOE8.1 FOE简介8.2 FOE 数据结构8.2.1 FOE帧格式8.2.2 FOE请求8.3 数据传输流程8.3.1 读流程8.3.2 写流程8.3.3 忙操作8.3.4 代码示例第8章 FOE 源码地址 8.1 FOE简介 FOE(File Access over Ethercat),用于节点之间的文件传输。协议类似于TFTP协议,感觉…...
SQL Zoo 8.Using Null
以下数据均来自SQL Zoo 1.List the teachers who have NULL for their department.(列出所属部门为NULL的教师) select name from teacher where dept is null 2.Note the INNER JOIN misses the teachers with no department and the departments wit…...
LeetCode274. H 指数
题目链接: 274. H 指数 - 力扣(LeetCode) 思路分析:这个题目可以使用哈希表来以空间换时间,我们设置一个数组v来统计每一个对应的影响因子的文章出现的数量,遍历一遍后,v[i]表示影响因子为i的…...
概述:Dubbo、Nacos、 Zookeeper 等分布式服务协调与治理等技术
目录 1. Dubbo 2. Nacos 3. Zookeeper Dubbo、Nacos、Zookeeper 是分布式服务协调与治理领域中的关键技术,它们在微服务架构和分布式系统中扮演着重要角色。以下是对这些技术的详细介绍: 1. Dubbo 概述: Dubbo 是一个高性能、轻量级的开…...
【LINUX】小工具降耦合,全内核函数插入宏摸索测试中。。
这阵子把这个小工具对外的耦合度降了下, include/linux/printk_self.h r77683962/linux-6.9.0 - Gitee.comhttps://gitee.com/r77683962/linux-6.9.0/blob/master/include/linux/printk_self.h 这个用于初始化打印日志的级别和打印次数: void Param…...
24/8/12算法笔记 复习_线性回归
import numpy as np#导入包 X np.array([[1,1],[2,1]])#构造矩阵 y np.array([14,10])np.linalg.solve(X,y) #linalg是线性代数,用于求解线性方程AX b,solve计算线性代数回归问题X.T#转置 a X.T.dot(X)#矩阵乘法B np.linalg.inv(a)#求逆矩阵from sklearn.linea…...
Linux系统驱动(十四)输入子系统
文章目录 一、输入子系统(一)输入子系统框架结构(二)输入子系统的API 二、实现两个按键的驱动(一)实现思路(二)代码实现 一、输入子系统 在linux系统中使用输入子系统驱动上报鼠标&…...
力扣(2024.08.12)
1. 98:验证二叉搜索树 # Definition for a binary tree node. # class TreeNode: # def __init__(self, val0, leftNone, rightNone): # self.val val # self.left left # self.right right class Solution:def isValidBST(self, r…...
最新版的AutoGPT,我搭建好了
最近AutoGPT不是更新了嘛 安装 我按照官方的教程 在本地搭建好了 改动 可见的改动,主要是把原来的纯命令行改成前后端的形式 看下前端界面 界面比较简单,主要分3个大块 监控 第一个是监控 主要是看你在 build 里构建的Agents的运行情况 build 第一个是Ag…...
[SWPUCTF 2021 新生赛]PseudoProtocols(构造伪协议)
打开题目所给的环境我们可以看到这样一句话: 这里我先尝试访问/hint.php ,但是发现什么都没有发生, F12查看源代码也并没有发现什么,到这里来看的话似乎没有思路了,但是这个题的题目已经给了我们很明显的提示…...
基于STM32开发的智能语音助手系统
目录 引言环境准备工作 硬件准备软件安装与配置系统设计 系统架构硬件连接代码实现 初始化代码控制代码应用场景 智能家居控制个人语音助理常见问题及解决方案 常见问题解决方案结论 1. 引言 随着人工智能技术的发展,智能语音助手已经逐渐进入了人们的日常生活。…...
本地多人游戏分屏工具:突破单机限制的创新解决方案
本地多人游戏分屏工具:突破单机限制的创新解决方案 【免费下载链接】nucleuscoop Starts multiple instances of a game for split-screen multiplayer gaming! 项目地址: https://gitcode.com/gh_mirrors/nu/nucleuscoop 你是否曾遇到这样的困境:…...
YOLO系列算法改进 | 主干改进篇 | 替换QARepVGG量化感知重参数化网络 | 通过权重与激活分布的协同优化,在保持部署推理速度的同时解决INT8量化精度崩塌难题 | AAAI 2024
0. 前言 本文介绍QARepVGG量化感知重参数化网络,并将其集成到ultralytics最新发布的YOLOv26目标检测算法中,替换原有Backbone网络。QARepVGG通过重新设计RepVGG的多分支结构(移除Identity与11分支的BN层、在分支融合后添加后置BN),从根本上解决了重参数化网络在INT8量化时…...
XiaomiGateway3网络稳定性终极指南:WiFi设置、信道选择与干扰排除
XiaomiGateway3网络稳定性终极指南:WiFi设置、信道选择与干扰排除 【免费下载链接】XiaomiGateway3 Home Assistant custom component for control Xiaomi Multimode Gateway (aka Gateway 3), Xiaomi Multimode Gateway 2, Aqara Hub E1 on default firmwares over…...
Unity游戏实时翻译插件XUnity.AutoTranslator的完整技术解析与实战指南
Unity游戏实时翻译插件XUnity.AutoTranslator的完整技术解析与实战指南 【免费下载链接】XUnity.AutoTranslator 项目地址: https://gitcode.com/gh_mirrors/xu/XUnity.AutoTranslator XUnity.AutoTranslator是一个功能强大的Unity游戏自动翻译框架,为全球玩…...
Xilinx Aurora 8B/10B IP核(5):GT资源规划实战——从PCB引脚到IP核Lane的映射法则
1. 从PCB引脚到IP核Lane的映射挑战 刚接触Xilinx Aurora 8B/10B IP核配置时,最让我头疼的就是这个"物理到逻辑"的映射问题。记得第一次调试时,明明IP核配置界面显示链路已建立,但实际硬件就是无法通信,后来发现是Lane分…...
Binary Ninja:开源二进制逆向工程的Python解决方案
Binary Ninja:开源二进制逆向工程的Python解决方案 【免费下载链接】deprecated-binaryninja-python Deprecated Binary Ninja prototype written in Python 项目地址: https://gitcode.com/gh_mirrors/de/deprecated-binaryninja-python 你是否曾面对一个陌…...
告别VIM原生补全:用coc.nvim + Node.js打造媲美VSCode的智能开发环境
告别VIM原生补全:用coc.nvim Node.js打造媲美VSCode的智能开发环境 在编辑器领域,VIM以其高效的键盘操作和强大的定制能力赢得了无数开发者的青睐。然而,对于那些习惯了现代IDE如VSCode、IntelliJ的开发者来说,VIM原生的代码补全…...
从手动15秒到自动0.8秒:米哈游游戏扫码登录的智能革命
从手动15秒到自动0.8秒:米哈游游戏扫码登录的智能革命 【免费下载链接】MHY_Scanner MHY扫码登录器,支持从直播流抢码。 项目地址: https://gitcode.com/gh_mirrors/mh/MHY_Scanner 在直播抢码、多账号切换的激烈竞争中,你是否还在为手…...
“你用AI,那我也会用AI,我还要你干什么?”
这个代码的核心功能是:基于输入词的长度动态选择反义词示例,并调用大模型生成反义词,体现了 “动态少样本提示(Dynamic Few-Shot Prompting)” 与 “上下文长度感知的示例选择” 的能力。 from langchain.prompts impo…...
OpenClaw隐私保护机制:Qwen3.5-9B本地化处理法律文件
OpenClaw隐私保护机制:Qwen3.5-9B本地化处理法律文件 1. 为什么律师事务所需要本地化AI方案 上个月,我帮一家本地律所部署了OpenClawQwen3.5-9B的解决方案。他们的核心诉求很简单:处理客户合同时,既要实现自动化识别关键条款&am…...
