0%

Python learning - Script

例:获取播放量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import time
import urllib
import re
from selenium import webdriver

#html = requests.get("https://www.bilibili.com/anime/index/#st=1&order=2&season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&sort=0&page=1")
#html = urlopen("https://www.bilibili.com/anime/index/#st=1&order=2&season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&sort=0&page=1")
#html.encoding = 'utf-8'

# Bilibili anime index listing; the page number is appended to the fragment.
INDEX_URL = (
    "https://www.bilibili.com/anime/index/#st=1&order=2&season_version=-1"
    "&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1"
    "&year=-1&style_id=-1&sort=0&page="
)

# Characters that cannot appear in a file name on Windows (includes '/').
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def clean_cover_url(url):
    """Return a downloadable cover URL, or ``None`` when *url* is unusable.

    The ``@...`` suffix (if any) is dropped and a scheme-relative URL
    (``//host/path``) is given an explicit ``https:`` scheme.  URLs that
    already carry a scheme are left alone.
    """
    if not url or len(url) < 2:
        return None
    url = url.partition('@')[0]
    if url.startswith('//'):
        url = 'https:' + url
    return url


def safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '-'."""
    return UNSAFE_FILENAME_CHARS.sub('-', title)


def scrape_page(page, option, writer):
    """Scrape one index page: write a CSV row per anime and save its cover.

    Args:
        page: 1-based page number appended to ``INDEX_URL``.
        option: ``webdriver.ChromeOptions`` used to start the browser.
        writer: ``csv.writer`` that receives ``[title, viewers]`` rows.
    """
    # A fresh browser is started for every page, as in the original script.
    # NOTE(review): the page number sits in the URL fragment, so reusing one
    # session would presumably not reload the listing - confirm before
    # hoisting the driver out of the loop.
    driver = webdriver.Chrome(options=option)
    try:
        url = INDEX_URL + str(page)
        print(url)
        driver.get(url)
        # Give the JavaScript-rendered listing time to appear.
        time.sleep(3)
        driver.minimize_window()
        # Non-breaking spaces are normalised to plain spaces before parsing.
        soup = BeautifulSoup(driver.page_source.replace('\xa0', ' '), "lxml")
    finally:
        # Always release the browser, even when loading the page fails.
        # Stray processes left by an interrupted run can be removed on
        # Windows with:
        #   taskkill /im chromedriver.exe /F
        #   taskkill /im chrome.exe /F
        driver.quit()

    for ani in soup.find_all(class_="bangumi-item"):
        title_tag = ani.find("a", class_="bangumi-title")
        views_tag = ani.find("div", class_="shadow")
        if title_tag is None or views_tag is None:
            # Incomplete card: nothing meaningful to record.
            continue
        title = title_tag.get_text()
        writer.writerow([title, views_tag.get_text()])

        holder = ani.find("div", class_="common-lazy-img")
        img = holder.find("img") if holder is not None else None
        cover = clean_cover_url(img.get("src", "") if img is not None else "")
        if cover is None:
            # No usable src on the image: keep the row, skip the download.
            continue
        urllib.request.urlretrieve(
            cover, "{0}.jpg".format(safe_filename(title))
        )


def main():
    """Scrape index pages 1-49 into ``test.csv`` and download the covers."""
    option = webdriver.ChromeOptions()
    # Run Chrome without a visible window.
    option.add_argument("headless")
    # Set the browser language to zh_CN.UTF-8.
    option.add_argument('lang=zh_CN.UTF-8')
    option.add_argument('--disable-plugins')

    # newline="" is required by the csv module; an explicit encoding keeps
    # non-ASCII titles from depending on the platform default.
    with open("test.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["Name", "Viewers"])
        for page in range(1, 50):
            scrape_page(page, option, writer)


if __name__ == "__main__":
    main()

chrome Options

chromeOptions 是一个配置 chrome 启动时属性的类。通过这个类,我们可以为 chrome 配置如下参数:

设置 chrome 二进制文件位置 (binary_location)
添加启动参数 (add_argument)
添加扩展应用 (add_extension, add_encoded_extension)
添加实验性质的设置参数 (add_experimental_option)
设置调试器地址 (debugger_address)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# .\Lib\site-packages\selenium\webdriver\chrome\options.py
class Options(object):
    """Container for the settings applied when Chrome is started.

    Excerpt showing only the initial state of each setting.
    """

    def __init__(self):
        # Location of the Chrome binary.
        self._binary_location = ''
        # Command-line arguments passed at start-up.
        self._arguments = []
        # Extensions to install.
        self._extension_files = []
        self._extensions = []
        # Experimental options.
        self._experimental_options = {}
        # Debugger address.
        self._debugger_address = None

常用配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from selenium import webdriver
# Catalogue of commonly used Chrome options; pick the ones you need rather
# than enabling all of them at once.
options = webdriver.ChromeOptions()
# Set the browser language to zh_CN.UTF-8.
options.add_argument('lang=zh_CN.UTF-8')
# Disable image loading.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# Route traffic through a proxy (replace ip:port with real values).
options.add_argument('--proxy-server=http://ip:port')

# Proxy that needs credentials, supplied through a generated extension.
# NOTE(review): create_proxyauth_extension is not defined in this snippet;
# it must be provided by the surrounding project before this runs.
proxyauth_plugin_path = create_proxyauth_extension(
    proxy_host='host',
    proxy_port='port',
    proxy_username="username",
    proxy_password="password"
)
options.add_extension(proxyauth_plugin_path)

options.add_argument('--window-size=1920,3000')  # browser resolution, "width,height"
options.add_argument('--disable-gpu')  # works around a bug
options.add_argument('--disable-infobars')  # hide the "controlled by automated software" notice
options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
options.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
options.add_argument('--headless')  # no visible window; required on Linux systems without a display
options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # explicit browser location
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=options)