# Downloads course videos from www.bjjnts.cn by driving Chrome through Selenium.
import json
import os
import re
import time
import urllib
from hashlib import md5
from threading import Thread

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
# Login page shared by GetPageHtml() and main().
_LOGIN_URL = "https://www.bjjnts.cn/login"

# SECURITY(review): account credentials are committed in source. They are kept
# only as fallbacks so existing behaviour is unchanged; prefer setting the
# BJJNTS_USERNAME / BJJNTS_PASSWORD environment variables and deleting these.
_DEFAULT_USERNAME = "370831199309165413"
_DEFAULT_PASSWORD = "bj123465"

# Course page written to config.json when no config file exists yet.
_DEFAULT_COURSE_URL = "https://www.bjjnts.cn/lessonStudy/202/4268"
_CONFIG_FILE = "config.json"

# Compiled once; raw strings so that ``\s`` is not an invalid str escape.
_VIDEO_SRC_RE = re.compile(r'<video src="(.*?)"\sposter=', re.S)
# Characters that cannot appear in a Windows file name, plus control chars.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _new_driver():
    """Start a Chrome WebDriver session with default options."""
    # NOTE(review): ``chrome_options=`` and ``find_element_by_*`` are the
    # Selenium 3 spellings used by the original script; kept unchanged.
    option = webdriver.ChromeOptions()
    return webdriver.Chrome(chrome_options=option)


def _login(driver):
    """Open the login page in *driver* and submit the credential form."""
    driver.get(_LOGIN_URL)
    username = os.environ.get("BJJNTS_USERNAME", _DEFAULT_USERNAME)
    password = os.environ.get("BJJNTS_PASSWORD", _DEFAULT_PASSWORD)
    driver.find_element_by_name("username").send_keys(username)
    driver.find_element_by_name("password").send_keys(password)
    driver.find_element_by_xpath(
        "//button[@class='login_btn' and @type='submit']"
    ).click()
    # Fixed pause (as in the original) so the login request can complete
    # before the next navigation.
    time.sleep(2)


def _safe_filename(name):
    """Strip surrounding whitespace and replace path-hostile characters."""
    return _UNSAFE_FILENAME_CHARS.sub("_", name.strip())


def GetPageHtml(url):
    """Log in with a fresh browser session and return the HTML of *url*.

    Args:
        url: Address of the page to load after logging in.

    Returns:
        The page source as a string, or ``None`` if the browser or a request
        failed.
    """
    driver = None
    try:
        driver = _new_driver()
        _login(driver)
        driver.get(url)
        return driver.page_source
    except (RequestException, WebDriverException):
        # Selenium raises WebDriverException subclasses, not RequestException;
        # both are mapped to the documented ``None`` result.
        return None
    finally:
        # Always release the browser process, even on failure.
        if driver is not None:
            driver.quit()


def ParseHtml(html):
    """Extract the ``src`` URL of the first ``<video ... poster=`` tag.

    Args:
        html: Page source to search; may be ``None`` or empty.

    Returns:
        The captured URL (also printed), or ``None`` when *html* is empty or
        contains no matching ``<video>`` tag.
    """
    if not html:
        return None
    match = _VIDEO_SRC_RE.search(html)
    if match is None:
        return None
    video_url = match.group(1)
    print(video_url)
    return video_url


def ParseVideoHtml(html, name):
    """Download the video at URL *html* to ``D://Downloads//<name>.mp4``.

    Prints a status message instead of raising: success, "already exists",
    or failure.

    Args:
        html: URL of the video file (the parameter name is historical).
        name: Title used as the file name; sanitised before use.
    """
    root = "D://Downloads//"
    path = root + _safe_filename(name) + ".mp4"
    try:
        os.makedirs(root, exist_ok=True)
        if os.path.exists(path):
            print("文件已存在")
            return
        if not html:
            # No URL was found for this lesson; nothing to fetch.
            print("爬取失败")
            return
        # Download to a temporary name so an interrupted transfer is never
        # mistaken for a finished file on the next run.
        partial = path + ".part"
        with requests.get(html, stream=True) as response:
            response.raise_for_status()
            with open(partial, "wb") as f:
                # Stream in 1 MiB chunks rather than buffering the whole video.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial, path)
        print("文件保存成功")
    except (RequestException, OSError):
        print("爬取失败")


def read_config():
    """Load settings from ``config.json`` in the current working directory.

    If the file does not exist it is first created with a default ``url``
    entry.

    Returns:
        The parsed JSON content of the config file.
    """
    if not os.path.isfile(_CONFIG_FILE):
        with open(_CONFIG_FILE, mode="w", encoding="utf-8") as json_file:
            json.dump({"url": _DEFAULT_COURSE_URL}, json_file, indent=4)
    with open(_CONFIG_FILE, encoding="utf-8") as json_file:
        return json.load(json_file)


def main():
    """Log in, open the configured course page and download every lesson."""
    config = read_config()
    course_url = config["url"]
    print(course_url)

    driver = _new_driver()
    try:
        _login(driver)
        driver.get(course_url)

        # The pasted original reads as replacing a space with a space (a
        # no-op); presumably the first argument was a non-breaking space lost
        # in extraction - TODO confirm. Both spellings are normalised here.
        page = driver.page_source.replace("&nbsp;", " ").replace("\xa0", " ")
        soup = BeautifulSoup(page, "lxml")
        lessons = soup.find_all(class_="course_study_menubox")
        total = len(lessons)

        for index, lesson in enumerate(lessons, start=1):
            title_tag = lesson.find("h4", class_="course_study_menutitle")
            if title_tag is not None:
                title = title_tag.get_text()
            else:
                # Menu entry without a title element: fall back to its number.
                title = "lesson-" + str(index)

            # The browser is showing lesson `index`; grab its video URL.
            video_url = ParseHtml(driver.page_source)
            ParseVideoHtml(video_url, title)

            if index == total:
                # Last lesson: there is no "lesson-(n+1)" link to click.
                break

            time.sleep(2)
            next_index = index + 1
            driver.find_element_by_xpath(
                "//a[@class='change_chapter lesson-" + str(next_index) + "']"
            ).find_element_by_xpath('..').click()
            time.sleep(2)
            print(str(next_index))
    finally:
        # Close the browser even if a lesson fails part-way through.
        driver.quit()
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    main()