Chinaunix首页 | 论坛 | 博客
  • 博客访问: 3596081
  • 博文数量: 365
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 2522
  • 用 户 组: 普通用户
  • 注册时间: 2019-10-28 13:40
文章分类

全部博文(365)

文章存档

2023年(8)

2022年(130)

2021年(155)

2020年(50)

2019年(22)

我的朋友

分类: Python/Ruby

2021-07-02 17:29:52

# coding: utf-8

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.action_chains import ActionChains

from selenium.common.exceptions import *

import requests

from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

requests.adapters.DEFAULT_RETRIES = 5

import time

import os

import re

# Shared Chrome session, created as soon as this module is executed; it is
# used by get_courseware() and closed by main().
driver = webdriver.Chrome()

# Alternative browser: driver = webdriver.Firefox()

# Explicit-wait helper bound to the shared driver with a 10-second timeout.
wait = WebDriverWait(driver, 10)

def download(url, file_name):
    """Download ``url`` to ``file_name`` unless a usable copy already exists.

    A file that exists and is larger than 10 bytes is treated as already
    downloaded and is left untouched. Otherwise the URL is fetched with
    ``requests`` (certificate verification disabled) and the response body is
    written to ``file_name``.

    The request is made *before* the target file is opened, so a failed request
    never leaves an empty or truncated file behind, and a non-success HTTP
    status is reported instead of being saved as if it were the document.

    Args:
        url: Address of the file; the part before ``#`` is sent as ``Referer``.
        file_name: Destination path on disk.
    """
    # Anything of 10 bytes or fewer is considered a broken earlier attempt.
    if os.path.exists(file_name) and os.path.getsize(file_name) > 10:
        print("\t文件已存在:{}".format(file_name))
        return

    # NOTE(review): 'Origin' is empty in the source listing — possibly the value
    # was lost when the script was published; confirm before relying on it.
    headers = {
        'Host': 'hubble.netease.com',
        'Origin': '',
        'Referer': url.split("#")[0],
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
    }

    r = requests.get(url, headers=headers, verify=False)
    if not r.ok:
        # Do not store an error page under the document's name.
        print("\t下载失败({}):{}".format(r.status_code, file_name))
        return

    # The context manager closes the file; no explicit close() is needed.
    with open(file_name, "wb") as f:
        f.write(r.content)
    print("\t下载成功:{}".format(file_name))

# CSS selector template for a chapter heading; the placeholder is the
# ``nth-child`` position of the chapter inside the chapter list.
_CHAPTER_H3 = "div > div.m-learnChapterList> div.m-learnChapterNormal:nth-child({}) > div.titleBox > h3"


def _chapter_heading(index):
    """Click the heading at ``nth-child(3)``, then return the clickable heading at ``index``."""
    first = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _CHAPTER_H3.format(3)))
    )
    first.click()
    return wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _CHAPTER_H3.format(index)))
    )


def _sanitize_name(text):
    """Apply the script's character replacements that make ``text`` usable in a file name."""
    # NOTE(review): ``replace("", " ")`` inserts a space around every character.
    # The first argument was probably a character lost from the published
    # listing; it is kept as-is because the intended character is unknown.
    return text.replace(":", "-").replace("/", " ").replace("\\", " ").replace("课件:", " ").replace(
        "", " ")


def get_courseware(courseware_url, path, c_range=(0, 0)):
    """Download the document attachments of a course into ``path``.

    Files are stored in ``<path>/<course title>_<school>/courseware``, where the
    school code is the run of letters between ``/`` and ``-`` in the URL.

    Args:
        courseware_url: Address of the course content page.
        path: Base directory under which the course folder is created.
        c_range: Two-item sequence ``[a, b]`` selecting chapters. Chapter
            headings are addressed by ``nth-child`` position starting at 3, so
            positions ``3 + a`` up to (but excluding) ``3 + b`` are visited.
            ``b == 0`` means "use the number of chapter headings found".

    The whole run is attempted at most twice; only ``FileNotFoundError`` causes
    a retry, any other exception propagates to the caller.
    """
    # NOTE(review): the ``(|)`` group only matches the empty string; the
    # alternatives were probably characters lost from the published listing.
    # As written, the match is the first one to three characters of the heading.
    patten = re.compile('.*?(.{1,3})(|).*?')

    for _attempt in range(2):
        try:
            driver.get(courseware_url)
            course_title = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#g-body > div.m-learnhead > div > div > div > a.f-fl > h4"))
            )
            school_name = re.findall(r'/([a-zA-Z]+)-', courseware_url)[0]

            # Keep the caller's ``path`` untouched so that a retry builds the
            # same directory instead of nesting one level deeper.
            save_dir = os.path.join(path, course_title.text + "_" + school_name, "courseware")
            os.makedirs(save_dir, exist_ok=True)

            # Total number of chapter headings on the page.
            h3_count = len(driver.find_elements(
                By.CSS_SELECTOR,
                "div > div.m-learnChapterList> div.m-learnChapterNormal > div.titleBox > h3"))
            last = h3_count if c_range[1] == 0 else c_range[1]

            for index in range(3 + c_range[0], 3 + last):
                driver.refresh()
                h3 = _chapter_heading(index)
                h3_text = h3.text
                print("{}:".format(h3_text), end="\t")

                match = patten.match(h3_text)
                # Sanitise once per chapter; the label is the same for every
                # file in the chapter.
                week = _sanitize_name(match.group(0) if match else h3_text)

                h3.click()
                time.sleep(3)

                file_count = len(driver.find_elements(
                    By.XPATH, '//div[@class="sourceList"]/*[@title="文档讲稿"]'))
                print(file_count)

                h4_count = len(driver.find_elements(By.CSS_SELECTOR, 'div.u-learnLesson > h4'))
                for h4_index in range(1, h4_count + 1):
                    h4 = wait.until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, 'div.u-learnLesson:nth-of-type({}) > h4.j-name'.format(h4_index)))
                    )
                    # Lesson heading text, used as a prefix for each file name.
                    h4str = h4.text

                    doc_selector = f'div.u-learnLesson:nth-of-type({h4_index}) > div.sourceList > div[title^="文档"]'
                    file_count = len(driver.find_elements(By.CSS_SELECTOR, doc_selector))

                    for f_index in range(file_count):
                        # Wait until the lesson's documents are interactive, then
                        # re-locate them: elements found before ``driver.back()``
                        # are stale.
                        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, doc_selector)))
                        entries = driver.find_elements(By.CSS_SELECTOR, doc_selector)
                        if f_index >= len(entries):
                            break
                        # Pick the f_index-th document rather than always the first.
                        entry = entries[f_index]
                        titlestr = entry.get_attribute("title")
                        entry.click()
                        time.sleep(0.2)

                        download_btn = wait.until(
                            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, '文档下载'))
                        )
                        download_url = download_btn.get_attribute("href")

                        title = _sanitize_name(f'{h4str} {titlestr}')
                        print(week, "   ", title)

                        # Extension: text after the last "." of the URL, cut at
                        # the first "&".
                        extension = download_url.split(".")[-1].split('&')[0]
                        file_name = os.path.join(
                            save_dir,
                            week + " " + "".join(title.split()).replace("", " ") + "." + extension)
                        print(file_name)
                        download(download_url, file_name)

                        # Return to the chapter list and reopen the current
                        # chapter for the next document.
                        driver.back()
                        time.sleep(1)
                        _chapter_heading(index).click()
            return
        except FileNotFoundError:
            print("FileNotFoundError: [Errno 2] No such file or directory: ")

def main():
    """Download all courseware of the configured course, then close the browser.

    The browser is closed in a ``finally`` clause so that it is not left running
    when the download raises.
    """
    # NOTE(review): this URL has no scheme or host — presumably they were lost
    # when the script was published; confirm the full address before running.
    courseware_url = '/learn/XDU-1001638014?tid=1462808447#/learn/content'
    path = r"D:\大二下\信号与系统\中国大学MOOC"

    try:
        # Arguments: course address, storage directory, chapter range [a, b]
        # ([0, 0] selects every chapter).
        get_courseware(courseware_url, path, [0, 0])
    finally:
        driver.quit()  # Close the browser.


if __name__ == '__main__':
    main()

阅读(1347) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~