Scrapy

Study Resources

Urllib

Urllib is a library built into Python

and it is made up of 4 modules

request

The request module is the one we use the most,
since it's what actually sends the requests,
so it's the module we'll focus on
urllib.request.urlopen(url, data=None, [timeout, ]*)
urllib.request.Request(url, data=None, headers={}, method=None)
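For example, a minimal GET showing both calls together (httpbin.org here is just a stand-in endpoint):

from urllib import request

# urlopen can take a plain URL string...
response = request.urlopen('https://httpbin.org/get', timeout=10)
print(response.status)

# ...or a Request object, which is how we attach headers, a method, etc.
req = request.Request('https://httpbin.org/get', headers={'User-Agent': 'my-app/0.0.1'})
print(request.urlopen(req, timeout=10).read().decode('utf-8'))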

error

The error module is for when a request made with the request module goes wrong;
it's what we use to handle the exception
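A minimal sketch of what that looks like (the URL is only an example):

from urllib import request, error

try:
    response = request.urlopen('https://httpbin.org/status/404', timeout=5)
except error.HTTPError as e:
    # the server answered, but with an error status code
    print(e.code, e.reason)
except error.URLError as e:
    # no response at all: DNS failure, refused connection, timeout...
    print(e.reason)

HTTPError is a subclass of URLError, so it has to be caught first.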

parse

The parse module parses our URLs, e.g. pulling out the domain, the path the URL points at, and so on
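For instance:

from urllib import parse

parts = parse.urlparse('https://biihu.cc/account/login?next=/home')
print(parts.scheme)   # https
print(parts.netloc)   # biihu.cc  (the domain)
print(parts.path)     # /account/login
print(parts.query)    # next=/home

# urlencode goes the other way: dict -> query string
print(parse.urlencode({'page': 1, 'q': 'banana'}))   # page=1&q=banana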

robotparser

This one sees less use; it parses a site's robots.txt
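A quick sketch (douban.com used only as an example site):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.douban.com/robots.txt')
rp.read()
# may user agent '*' fetch this URL, according to robots.txt?
print(rp.can_fetch('*', 'https://www.douban.com/group/'))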

SSL

from urllib import request,parse
import ssl

# skip certificate verification (fine for a demo, not for production)
context = ssl._create_unverified_context()

url = 'https://biihu.cc//account/ajax/login_process/'
headers = {
    # pretend to be a browser
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
form_data = {
    'return_url':'https://biihu.cc/',
    'user_name':'xiaoshuaib@gmail.com',
    'password':'123456789',
    '_post_type':'ajax',
}

# encode the request parameters as bytes
data = bytes(parse.urlencode(form_data),'utf-8')
# then wrap everything up in a Request object
req = request.Request(url,data=data,headers=headers,method='POST')
# finally, send the request
response = request.urlopen(req,context=context)
print(response.read().decode('utf-8'))
 

Requests

pip install requests
import requests

r = requests.get('https://api.github.com/events')
r = requests.post('https://httpbin.org/post', data = {'key':'value'})
r = requests.put('https://httpbin.org/put', data = {'key':'value'})
r = requests.delete('https://httpbin.org/delete')
r = requests.head('https://httpbin.org/get')
r = requests.options('https://httpbin.org/get')

# carry query-string parameters
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get('https://httpbin.org/get', params=payload)

# pretend to be a browser
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)

GET requests

# get the server's response body as text
>>> import requests
>>> r = requests.get('https://api.github.com/events')

>>> r.text
u'[{"repository":{"open_issues":0,"url":"https://github.com/...'
>>> r.encoding
'utf-8'

# get the response body as bytes
>>> r.content
b'[{"repository":{"open_issues":0,"url":"https://github.com/...'

# get the response status code
>>> r = requests.get('https://httpbin.org/get')
>>> r.status_code
200

# get the response headers
>>> r.headers
{
    'content-encoding': 'gzip',
    'transfer-encoding': 'chunked',
    'connection': 'close',
    'server': 'nginx/1.0.4',
    'x-runtime': '148ms',
    'etag': '"e1ca502697e5c9317743dc078f67693f"',
    'content-type': 'application/json'
}

# get the JSON response content
>>> import requests
>>> r = requests.get('https://api.github.com/events')
>>> r.json()
[{u'repository': {u'open_issues': 0, u'url': 'https://github.com/...'

# get the raw socket stream response
>>> r = requests.get('https://api.github.com/events', stream=True)
>>> r.raw
<urllib3.response.HTTPResponse object at 0x101194810>
>>> r.raw.read(10)
'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03'

# read cookie information
>>> url = 'http://example.com/some/cookie/setting/url'
>>> r = requests.get(url)
>>> r.cookies['example_cookie_name']
'example_cookie_value'

# send cookie information
>>> url = 'https://httpbin.org/cookies'
>>> cookies = dict(cookies_are='working')
>>> r = requests.get(url, cookies=cookies)
>>> r.text
'{"cookies": {"cookies_are": "working"}}'

# set a timeout
>>> requests.get('https://github.com/', timeout=0.001)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
requests.exceptions.Timeout: HTTPConnectionPool(host='github.com', port=80): Request timed out. (timeout=0.001)

POST requests

# several values under a single key
>>> payload_tuples = [('key1', 'value1'), ('key1', 'value2')]
>>> r1 = requests.post('https://httpbin.org/post', data=payload_tuples)
>>> payload_dict = {'key1': ['value1', 'value2']}
>>> r2 = requests.post('https://httpbin.org/post', data=payload_dict)
>>> print(r1.text)
{  ...  "form": {    "key1": [      "value1",      "value2"    ]  },  ...}
>>> r1.text == r2.text
True

# send a JSON body
>>> url = 'https://api.github.com/some/endpoint'
>>> payload = {'some': 'data'}
>>> r = requests.post(url, json=payload)

# upload a file
>>> url = 'https://httpbin.org/post'
>>> files = {'file': open('report.xls', 'rb')}
>>> r = requests.post(url, files=files)
>>> r.text
{  ...  "files": {    "file": "<censored...binary...data>"  },  ...}
 
 

Pitfalls

If the request has to be in JSON format,

add content-type to the header and wrap the body up as JSON
import requests
import json

headers = {'user-agent': 'my-app/0.0.1', 'content-type': 'application/json;charset=UTF-8'}
payload = {"mail": "jimschenchen@gmail.com", "password": "111"}

# serialize the dict to a JSON string before sending
r = requests.put('http://utcupids.com/api/users/signin', headers=headers, data=json.dumps(payload))
print(r.content)
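Newer versions of requests can actually do all of this for us: passing json=payload serializes the dict and sets the Content-Type header automatically, so the same request can be written as

import requests

payload = {"mail": "jimschenchen@gmail.com", "password": "111"}
# requests sets Content-Type: application/json and calls json.dumps itself
r = requests.put('http://utcupids.com/api/users/signin', json=payload)
print(r.content)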
 

Regular Expression


Python Lib

import re

# re.match: the first argument is the matching pattern,
# the second is the string to be matched against
content = 'Xiaoshuaib has 100 bananas'
res = re.match(r'^Xi.*(\d+)\s.*s$',content)
print(res.group(1))   # prints '0': the greedy .* swallows the '10'

# re.search scans the whole string
# and hands back the first successful match
content = """Xiaoshuaib has 100 
bananas"""
res = re.search(r'Xi.*?(\d+)\s.*s',content,re.S)
print(res.group(1))

# re.findall
# gives us every match at once
content = """Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;"""
res = re.findall(r'Xi.*?(\d+)\s.*?s;',content,re.S)
print(res)

# re.sub replaces the matched content in place
content = """Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;
Xiaoshuaib has 100 bananas;"""
content = re.sub(r'\d+','250',content)
print(content)

# re.compile
# just wraps our pattern up as an object
content = "Xiaoshuaib has 100 bananas"
pattern = re.compile(r'Xi.*?(\d+)\s.*s',re.S)
res = re.match(pattern,content)

print(res.group(1))

# exactly what we wrote before,
# just run through compile once
# so it can be reused later
res = re.match(r'^Xi.*?(\d+)\s.*s$',content,re.S)
 

status_code = 418 means Douban spotted the scraper and refused the request ("I'm a teapot")

# send the request with headers attached
# (ua here is assumed to be a fake_useragent UserAgent instance)
from fake_useragent import UserAgent
ua = UserAgent()
url = "https://movie.douban.com/subject/26266893/reviews?start=120"
headers = {'User-Agent': ua.chrome}
# note: headers must be passed as a keyword argument,
# or requests treats it as query parameters
r = requests.get(url, headers=headers)
 
 

Selenium

Selenium support for PhantomJS has been deprecated,

so use headless Chrome instead
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# newer Selenium versions take options= instead of the deprecated chrome_options=
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://cnblogs.com/")
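From there, driver behaves like a normal browser session, for example:

print(driver.title)              # title of the loaded page
print(len(driver.page_source))   # the rendered HTML
driver.quit()                    # shut the headless browser down when done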

Switching windows

currentHandle = browser.current_window_handle
# jump over to the newly opened window
for handle in browser.window_handles:
    # the new page will have some distinctive attribute you can use to confirm it
    if handle != currentHandle:
        browser.switch_to.window(handle)
        break
print(browser.title)

Compound CSS classes

total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[class='page-item last'] > button")))
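WAIT and EC are not defined in that one-liner; a self-contained version might look like this (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
WAIT = WebDriverWait(driver, 10)

# a compound class like "page-item last" can't go into By.CLASS_NAME,
# but a CSS attribute selector can match the full class string
total = WAIT.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "[class='page-item last'] > button")))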
 
 

Parallelism

A single-core CPU
can only execute one process at a time,
but a multi-core CPU
can execute several processes at once,
and that is parallelism:
at one and the same moment, multiple processes are genuinely running simultaneously

Concurrency

Concurrency is when, over a period of time, many tasks are all in progress,
taking turns on the CPU.
Think about it:
at midnight on Double 11,
a huge crowd of people are all flash-buying the same item at once,
and the server keeps them all moving by handling their requests concurrently

Mutex: the GIL

A so-called mutex
makes the threads behave themselves
and stop running wild:
they execute one at a time, safe and ordered.
Python's GIL is one such lock, and it gets released whenever a thread blocks on I/O
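A small sketch of the consequence: threads still overlap fine on I/O-bound work, because the GIL is dropped while each thread blocks (time.sleep stands in for a network call):

import threading
import time

def fake_request(i):
    time.sleep(1)   # the GIL is released while blocking here
    print('request', i, 'done')

start = time.time()
threads = [threading.Thread(target=fake_request, args=(i,)) for i in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# roughly 1 second in total, not 5: the threads overlapped while sleeping
print('elapsed:', time.time() - start)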
 

Multiprocessing

To handle multiple tasks and make full use of our multi-core CPU,
start multiple processes (multiprocessing);
for multi-tasking that is mostly input/output,
start multiple threads instead (threading.Thread, multiprocessing.dummy), as sketched below
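A minimal sketch of both; multiprocessing.dummy exposes the same Pool API, just backed by threads:

import requests
from multiprocessing import Pool                        # processes: CPU-bound work
from multiprocessing.dummy import Pool as ThreadPool    # threads: I/O-bound work

def cpu_task(n):
    return sum(i * i for i in range(n))

def io_task(url):
    return requests.get(url).status_code

if __name__ == '__main__':
    with Pool(4) as p:            # 4 worker processes
        print(p.map(cpu_task, [10 ** 6] * 4))
    with ThreadPool(4) as p:      # 4 worker threads
        print(p.map(io_task, ['https://httpbin.org/get'] * 4))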

Coroutines / micro-threads

Within a single thread, multiple functions can take turns executing.
Threads and processes are scheduled by the operating system,
whereas micro-threads don't need that: they schedule themselves as required.
Because micro-threads just switch between functions,
the overhead is tiny
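asyncio coroutines make this concrete: await is exactly the point where one function hands control to another, with no OS scheduler involved:

import asyncio

async def fetch(i):
    # await yields control back to the event loop,
    # which switches to another coroutine -- user-level scheduling
    await asyncio.sleep(1)
    return i

async def main():
    results = await asyncio.gather(*(fetch(i) for i in range(5)))
    print(results)   # done in about 1 second, not 5

asyncio.run(main())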
