当前位置: 首页 > news >正文

爬虫练习_01


前言

基础爬虫小练习01

一、requests模块使用

demo_01

import requests
from lxml import etree

# demo_01: fetch the Douban Top 250 page and print each listed film's
# title and rating.
url = "https://movie.douban.com/top250"

# Browser-like headers copied from DevTools, with two kinds of entries removed:
#   * HTTP/2 pseudo-headers (authority/method/path/scheme) -- these are not
#     real header fields and would be sent verbatim as bogus headers.
#   * accept-encoding -- advertising "br, zstd" lets the server reply with an
#     encoding requests cannot decode unless optional packages are installed;
#     leaving it out lets requests advertise only what it supports.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() surfaces 4xx/5xx responses as a clear HTTPError instead
# of letting an error page reach the parser.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()

page = etree.HTML(resp.text)

# Each film entry is a <div class="item"> inside the <ol class="grid_view"> list.
rts = page.xpath("//ol[@class='grid_view']/li/div[@class='item']")
for rt in rts:
    # An entry may carry several title spans; only the first one is printed.
    title = rt.xpath(".//span[@class='title']/text()")[0]
    score = rt.xpath(".//span[@class='rating_num']/text()")[0]
    print(title, score)

demo_02

# https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
import requests
import lxml

# demo_02: call the JSON endpoint behind the ranking page above and print
# the decoded response.
url = "https://movie.douban.com/j/chart/top_list"

# Browser-like headers copied from DevTools, with these entries removed:
#   * HTTP/2 pseudo-headers (authority/method/path/scheme), which are not
#     real header fields.
#   * accept-encoding, so requests advertises only encodings it can decode.
#   * "Enhanced"/"The"/"New"/"Updated" -- text accidentally pasted from the
#     DevTools "What's New" panel, not HTTP headers at all.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; _pk_ses.100001.4cf6=1; __utma=30149280.1388041158.1723366816.1723366816.1723366816.1; __utmb=30149280.0.10.1723366816; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.165463845.1723366816.1723366816.1723366816.1; __utmb=223695111.0.10.1723366816; __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0; __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01",
    "priority": "u=1, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# Query-string parameters; requests URL-encodes them (e.g. the ":" in
# interval_id) when building the final URL.
my_params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

# The timeout avoids hanging on a stalled connection; raise_for_status()
# reports an HTTP error directly rather than failing later inside .json().
page = requests.get(url=url, params=my_params, headers=headers, timeout=10)
page.raise_for_status()

# print(page.text)
# print(page.request.url)
print(page.json())

demo_03

# https://www.iciba.com/translate
import requests

# demo_03: POST a phrase to the iciba translation endpoint and print the
# decoded JSON reply.
url = "https://ifanyi.iciba.com/index.php"

# Browser-like headers copied from DevTools, with these entries removed:
#   * HTTP/2 pseudo-headers (authority/method/path/scheme), which are not
#     real header fields.
#   * accept-encoding, so requests advertises only encodings it can decode.
#   * content-length -- requests computes it from the actual form body; the
#     copied value ("34") does not match the body sent here.
#   * "Enhanced"/"The"/"New"/"Updated" -- text accidentally pasted from the
#     DevTools "What's New" panel, not HTTP headers at all.
my_headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded",
    "origin": "https://www.iciba.com",
    "priority": "u=1, i",
    "referer": "https://www.iciba.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Query-string parameters. The value is given un-encoded ("+" and "=");
# requests percent-encodes it when building the URL.
# NOTE(review): "sign" was captured from one browser request and is presumably
# tied to the query text below -- confirm before changing "q".
my_param = {
    "c": "trans",
    "m": "fy",
    "client": "6",
    "auth_user": "key_web_new_fanyi",
    "sign": "SioZA5jWOFlUevETjhAhp9RriqVIAJSQ+xmfU0q7dIE=",
}

# Form body, sent as application/x-www-form-urlencoded.
form_data = {
    "from": "auto",
    "to": "auto",
    "q": "i love you",
}

# The timeout avoids hanging on a stalled connection; raise_for_status()
# reports an HTTP error directly rather than failing later inside .json().
resp = requests.post(
    url,
    params=my_param,
    data=form_data,
    headers=my_headers,
    timeout=10,
)
resp.raise_for_status()
print(resp.json())

demo_04

# Download a single image and save it to disk.
import requests

url = "https://img.yituyu.com/gallery/8234/00_llcUYWmo.jpg"

# Browser-like headers copied from DevTools, with these entries removed:
#   * if-modified-since / if-none-match -- conditional-request validators
#     from the browser's cache; with them the server may answer
#     "304 Not Modified" with an empty body, and an empty file would be saved.
#   * host -- requests derives it from the URL; a hard-coded value would also
#     be re-sent unchanged if the request were redirected to another host.
#   * accept-encoding, so requests advertises only encodings it can decode.
headers = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-language": "zh-CN,zh;q=0.9",
    "connection": "keep-alive",
    "cookie": "yituyu_first_time=1723381635000; yituyu_os=Windows%20NT%2010.0%3B%20Win64%3B%20x64; Hm_lvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381639; HMACCOUNT=211F43F3C6950EDF; Hm_lpvt_9714eb07ec1e2c497aefe3d4dfded3ed=1723381807",
    "referer": "https://www.yituyu.com/",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# The timeout avoids hanging on a stalled connection; raise_for_status()
# stops the script before an HTTP error page is written out as "tu.jpg".
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()

# resp.content is the raw response body as bytes.
print(resp.content)

# Binary mode ("wb") writes the bytes unchanged.
with open("tu.jpg", mode="wb") as f:
    f.write(resp.content)

demo_05

# https://movie.douban.com/review/best/
import requests
from lxml import etree
import re

# demo_05: list the reviews on the page above, then fetch each review's full
# text from the per-review JSON endpoint and print title + text.
url = "https://movie.douban.com/review/best/"

# Browser-like headers copied from DevTools, with two kinds of entries removed:
#   * HTTP/2 pseudo-headers (authority/method/path/scheme), which are not
#     real header fields.
#   * accept-encoding, so requests advertises only encodings it can decode.
my_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "cookie": "bid=cqDgOHLreU0; _pk_id.100001.4cf6=a48c68336a0dbbc8.1723366816.; __utmc=30149280; __utmz=30149280.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=223695111; __utmz=223695111.1723366816.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=RjXMEHOKLnUzT6aAzCxf17A5kcLgeeL9; ll=\"108296\"; _vwo_uuid_v2=DA8FF1B0948EE9728736018FE9DFF12E8|1eb76b564769076007eeb2dd472eae01; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1388041158.1723366816.1723366816.1723382928.2; __utmb=30149280.0.10.1723382928; __utma=223695111.165463845.1723366816.1723366816.1723382928.2; __utmb=223695111.0.10.1723382928",
    "priority": "u=0, i",
    "referer": "https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=",
    "sec-ch-ua": "\"Not)A;Brand\";v=\"99\", \"Google Chrome\";v=\"127\", \"Chromium\";v=\"127\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}

# Compiled once, outside the loop; used to strip all whitespace from the text.
whitespace = re.compile(r"\s")

# One Session for the listing request and every per-review request, so the
# underlying connection to the host is reused instead of reopened each time.
with requests.Session() as session:
    session.headers.update(my_headers)

    # The timeout avoids hanging on a stalled connection; raise_for_status()
    # surfaces HTTP errors before an error page reaches the parser.
    resp_1 = session.get(url, timeout=10)
    resp_1.raise_for_status()
    # print(resp_1.text)

    page = etree.HTML(resp_1.text)

    # Every review entry is a <div> carrying a data-cid attribute.
    rts = page.xpath("//div[@data-cid]")
    print(len(rts))

    for rt in rts:
        title = rt.xpath(".//div[@class='main-bd']/h2/a/text()")[0]
        # data-rid is the review id used to build the full-text URL below.
        rid = rt.xpath(".//div[@class='review-short']/@data-rid")[0]
        # print(title, rid)

        full_url = f"https://movie.douban.com/j/review/{rid}/full"
        full_resp = session.get(full_url, timeout=10)
        full_resp.raise_for_status()

        # The JSON reply's "body" field is an HTML fragment; parse it and
        # collect every text node under the review-content container.
        body = full_resp.json()["body"]
        body_page = etree.HTML(body)
        text_nodes = body_page.xpath(
            "//div[@class='review-content clearfix']//text()"
        )
        content = whitespace.sub("", "".join(text_nodes))

        print(title)
        print(content)
        print("===================================")

总结

requests 模块练习01

相关文章:

爬虫练习_01

前言 基础爬虫小练习01 一、requests板块使用 demo_01 import requests from lxml import etree url = "https://movie.douban.com/top250" headers = {"authority": "movie.douban.com","method": "GET","path"…...

Datawhale X 魔搭 AI夏令营第四期 魔搭-AIGC方向 task02笔记

从零入门AI生图原理&实践 是 Datawhale 2024 年 AI 夏令营第四期的学习活动(“AIGC”方向),基于魔搭社区“可图Kolors-LoRA风格故事挑战赛”开展的实践学习。 Datawhale官方的Task2链接:Task02 往期Task1链接:Ta…...

多模态大语言模型的免训练视觉提示学习 ControlMLLM

ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models github paper 在本研究中,提出了一种无需进行训练的方法,通过可学习的潜变量优化将视觉提示注入到多模态大型语言模型(MLLMs)中。 在…...

Oracle|DM 常用|不常用 SQL大口袋

目录 一、前言 二、SQL写法 1、sql获取某一条数据中的前一条和后一条 2、实现like多个值的查询(Oracle和dm支持,MySQL未试过) 3、start with connect by prior 使用方法 4、用hextoraw解决select、update、delete语句执行慢 5、ORA-00…...

嵌入式软件--模电基础 DAY 1

C语言的学习告一段落了,要多多注意复习回顾,温故而知新,学习的过程就是与遗忘作斗争。接下来就是嵌入式学习中硬件电路方面的知识了。 一、电学基础 1.电流 电流(Current)是电荷在单位时间内通过导体横截面的流动量…...

【Nacos无压力源码领读】(二) 集成 LoadBalancer 与 OpenFeign

上一篇文章中, 详细介绍了 Nacos 注册中心的原理, 相信看完后, 大家应该完全掌握了 Nacos 客户端是如何自动进行服务注册的, 以及 Nacos 客户端是如何订阅服务实例信息的, 以及 Nacos 服务器是如何处理客户端的注册和订阅请求的; 本文承上启下, 在订阅服务实例的基础上, 介绍如…...

《投资的原理》阅读笔记二——价值投资真是王者吗?

《投资的原理》的第二章是《史记货殖列传里的八大投资金句》,作者在这一章里宣扬的主要观点是价值投资才是稳妥的投资之路。但我觉得作者讲述的很多例子,包括经典的“两个金条放在一起,你告诉我哪根是高尚的”,更多的应该体现在“…...

SSH、FTP、SFTP相关协议详解

一、SSH 1、定义 SSH(Secure Shell)是一种网络协议,用于加密方式远程登录到另一台计算机上,并执行命令或程序。SSH由IETF的网络小组(Network Working Group)所制定,是建立在应用层基础上的安全…...

C语言进阶——一文带你深度了解“C语言关键字”(中篇6)

本篇文章记录我学习C语言进阶知识——C语言关键字,旨在记录分享,希望我的分享能带给你不一样的收获! 目录 一、return关键字 二、const 关键字也许该被替换为 readonly (一)、 const 修饰的只读变量 (二…...

自建极简Ethercat主站-第8章 FOE基础功能实现

文章目录 第8章 FOE8.1 FOE简介8.2 FOE 数据结构8.2.1 FOE帧格式8.2.2 FOE请求8.3 数据传输流程8.3.1 读流程8.3.2 写流程8.3.3 忙操作8.3.4 代码示例第8章 FOE 源码地址 8.1 FOE简介 ​ FOE(File Access over Ethercat),用于节点之间的文件传输。协议类似于TFTP协议,感觉…...

SQL Zoo 8.Using Null

以下数据均来自SQL Zoo 1.List the teachers who have NULL for their department.(列出所属部门为NULL的教师) select name from teacher where dept is null 2.Note the INNER JOIN misses the teachers with no department and the departments wit…...

LeetCode274. H 指数

题目链接: 274. H 指数 - 力扣(LeetCode) 思路分析:这个题目可以使用哈希表来以空间换时间,我们设置一个数组v来统计每一个对应的影响因子的文章出现的数量,遍历一遍后,v[i]表示影响因子为i的…...

概述:Dubbo、Nacos、 Zookeeper 等分布式服务协调与治理等技术

目录 1. Dubbo 2. Nacos 3. Zookeeper Dubbo、Nacos、Zookeeper 是分布式服务协调与治理领域中的关键技术,它们在微服务架构和分布式系统中扮演着重要角色。以下是对这些技术的详细介绍: 1. Dubbo 概述: Dubbo 是一个高性能、轻量级的开…...

【LINUX】小工具降耦合,全内核函数插入宏摸索测试中。。

这阵子把这个小工具对外的耦合度降了下, include/linux/printk_self.h r77683962/linux-6.9.0 - Gitee.comhttps://gitee.com/r77683962/linux-6.9.0/blob/master/include/linux/printk_self.h 这个用于初始化打印日志的级别和打印次数: void Param…...

24/8/12算法笔记 复习_线性回归

import numpy as np#导入包 X = np.array([[1,1],[2,1]])#构造矩阵 y = np.array([14,10])np.linalg.solve(X,y) #linalg是线性代数,用于求解线性方程AX = b,solve计算线性代数回归问题X.T#转置 a = X.T.dot(X)#矩阵乘法B = np.linalg.inv(a)#求逆矩阵from sklearn.linea…...

Linux系统驱动(十四)输入子系统

文章目录 一、输入子系统(一)输入子系统框架结构(二)输入子系统的API 二、实现两个按键的驱动(一)实现思路(二)代码实现 一、输入子系统 在linux系统中使用输入子系统驱动上报鼠标&…...

力扣(2024.08.12)

1. 98:验证二叉搜索树 # Definition for a binary tree node. # class TreeNode: # def __init__(self, val=0, left=None, right=None): # self.val = val # self.left = left # self.right = right class Solution:def isValidBST(self, r…...

最新版的AutoGPT,我搭建好了

最近AutoGPT不是更新了嘛 安装 我按照官方的教程 在本地搭建好了 改动 可见的改动,主要是把原来的纯命令行改成前后端的形式 看下前端界面 界面比较简单,主要分3个大块 监控 第一个是监控 主要是看你在 build 里构建的Agents的运行情况 build 第一个是Ag…...

[SWPUCTF 2021 新生赛]PseudoProtocols(构造伪协议)

打开题目所给的环境我们可以看到这样一句话: 这里我先尝试访问/hint.php ,但是发现什么都没有发生, F12查看源代码也并没有发现什么,到这里来看的话似乎没有思路了,但是这个题的题目已经给了我们很明显的提示&#xff…...

基于STM32开发的智能语音助手系统

目录 引言环境准备工作 硬件准备软件安装与配置系统设计 系统架构硬件连接代码实现 初始化代码控制代码应用场景 智能家居控制个人语音助理常见问题及解决方案 常见问题解决方案结论 1. 引言 随着人工智能技术的发展,智能语音助手已经逐渐进入了人们的日常生活。…...

FairyGUI Unity鼠标悬停与点击对象获取原理与实战

1. 这不是“加个OnMouseEnter就能用”的事:FairyGUI在Unity中处理鼠标交互的真实困境很多人第一次在Unity里集成FairyGUI,想实现“鼠标悬停显示提示”或“点击高亮当前按钮”,下意识就去翻Unity的MonoBehaviour文档,找OnMouseEnte…...

5个必知的Universal-Updater高级功能:从QR扫描到后台安装

5个必知的Universal-Updater高级功能:从QR扫描到后台安装 【免费下载链接】Universal-Updater An easy to use app for installing and updating 3DS homebrew 项目地址: https://gitcode.com/gh_mirrors/un/Universal-Updater Universal-Updater是一款专为任…...

高精度光照检测

光线检测仪,kotlin开发,调用手机感光模块检测室内外光照强度,用途多多,我主要用途孩子写作业检测光照保护视力。 食用方法∶打开即测,速度快,无广告,手机平视即可,无须直视光线。 买…...

AutoPentest:面向红队的渗透测试决策引擎架构解析

1. 这不是又一个“自动化扫描器”,而是一套能替你做决策的渗透测试工作流引擎AutoPentest这个名字,第一眼容易让人联想到Nmap加个for循环、或者Burp Suite里点几下Intruder——但实际用过的人很快会意识到:它根本不在同一个维度上。我第一次在…...

大模型测试新范式:Claude端到端验证的5层断言体系(语义一致性/上下文连贯性/安全边界/成本阈值/时序鲁棒性)

更多请点击: https://codechina.net 第一章:大模型测试新范式:Claude端到端验证的5层断言体系(语义一致性/上下文连贯性/安全边界/成本阈值/时序鲁棒性) 传统LLM测试常聚焦于准确率或BLEU等静态指标,而Cla…...

AI算法工程师如何进行数据预处理?这5个步骤让你的数据更优质

在AI模型开发与测试的全流程中,数据质量直接决定了最终模型的效果上限——哪怕是最先进的大语言模型,用劣质数据训练出来也只能输出劣质结果。对于软件测试从业者来说,不管是参与AI模型的功能测试、性能测试,还是负责测试数据集的…...

ComfyUI-WD14-Tagger:AI智能图像标签提取的终极完整指南

ComfyUI-WD14-Tagger:AI智能图像标签提取的终极完整指南 【免费下载链接】ComfyUI-WD14-Tagger A ComfyUI extension allowing for the interrogation of booru tags from images. 项目地址: https://gitcode.com/gh_mirrors/co/ComfyUI-WD14-Tagger 在AI图像…...

Playwright文件上传避坑指南:遇到动态生成的文件选择框怎么办?

Playwright文件上传避坑指南:动态生成文件选择框的实战解决方案最近在为一个电商平台做自动化测试时,遇到了一个棘手的问题——商品图片上传功能总是失败。页面上的"上传图片"按钮明明可以点击,但传统的set_input_files()方法却毫无…...

BurpSuite+SqlMap深度集成:构建高可信SQL注入检测流水线

1. 这不是“点几下就出结果”的玩具,而是你真正能放进渗透流程里的SQL注入检测流水线很多人第一次看到“BurpSuite+SqlMap插件5分钟搞定SQL注入检测”这个标题,第一反应是:又一个标题党?点开全是截图堆砌、参数照抄、报错就卡住的半…...

5步快速上手OpenVSP:免费开源的飞机参数化设计终极指南

5步快速上手OpenVSP:免费开源的飞机参数化设计终极指南 【免费下载链接】OpenVSP A parametric aircraft geometry tool 项目地址: https://gitcode.com/gh_mirrors/ope/OpenVSP OpenVSP是一款由NASA开发的免费开源飞机参数化设计工具,让航空工程…...