2021-06-23

用 Python 离线下载网页：以 GitBook 上的某个页面为例

文章目录

1. 读取文件时出现 UnicodeDecodeError
2. 在 Python 环境下 Windows 和 Linux 路径分隔符的区别
3. 指定文件路径的父文件夹如果不存在则直接新建

先上代码。本代码的实现参考了这篇文章：

https://blog.csdn.net/gorquanwu/article/details/81739589

# author: leek
# date：2021-6-23
from urllib import request
from bs4 import BeautifulSoup as bs
import time
import os
import re

'''
    用来爬取网站网页  gitbook 页面 离线下载到本地
    实现功能：url深度抓取，保存每个页面的css、html、js等文件
'''


# 深度爬取当前页面子网站子网站
def get_urls(url, baseurl, urls):
    """Collect the links found in the page's ``<nav>`` element into ``urls``.

    Fetches ``url``, decodes the body as UTF-8 and, for every ``<a>`` inside
    the first ``<nav>`` element, appends ``baseurl + "/" + href`` to ``urls``
    unless that address is already present.

    Args:
        url: Address of the page whose navigation links are read.
        baseurl: Prefix joined (with a ``/``) in front of every href.
        urls: List that is extended in place with newly found addresses.
    """
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')

    nav = bs(data).find("nav")
    if nav is None:
        # No <nav> element on this page, hence no links to collect.
        return

    for anchor in nav.find_all('a'):
        suffix = anchor.get('href')
        # An <a> tag without an href attribute yields None; the substring
        # test below would raise TypeError on it.
        if suffix is None:
            continue
        # Skip placeholder / script-only links that do not point to a page.
        if suffix == '#' or suffix == '#carousel-example-generic' or 'javascript:void(0)' in suffix:
            continue
        childurl = baseurl + "/" + suffix
        if childurl not in urls:
            urls.append(childurl)

# 获取每个页面代码以及获取页面上的css，js，img路径
def get_source(url, path):
    """Download one page and save its HTML under ``path``, mirroring the URL path.

    The output file name is ``path`` followed by every ``/``-separated
    component of ``url`` from index 3 onward (i.e. everything after the
    scheme and host of an ``http://host/...`` address). Missing parent
    directories are created. Failures are reported with ``print`` instead of
    being raised, so a caller looping over many pages keeps going.

    Args:
        url: Absolute address of the page to fetch.
        path: Directory under which the page is stored.
    """
    try:
        with request.urlopen(url) as f:
            html_source = f.read().decode()

        # Drop "http:", "" and the host; keep the remaining path components.
        htmlFile = path
        for part in url.split('/')[3:]:
            htmlFile += "/" + part

        # Create the parent directory of the target file if it is missing.
        os.makedirs(os.path.abspath(htmlFile + os.path.sep + ".."), exist_ok=True)
        with open(htmlFile, 'w', encoding='UTF-8') as out:
            out.write(html_source)
        print(htmlFile + "文件保存成功")
        # Pause between pages so the server is not hit in a tight loop.
        time.sleep(1)
    except Exception as exc:
        # Best-effort: report the failing page together with the cause.
        # Exception (not a bare except) lets Ctrl+C still stop the crawl.
        print(url + "保存html文件时报错", exc)

# 保存js文件
def save_css_js(path, url="http://sdk.g-bim.cn"):
    """Download the JS and CSS files referenced by the page at ``url``.

    The page is scanned for ``<script src="...">`` and
    ``<link rel="stylesheet" href="...">`` references. Each reference is
    fetched from ``url + "/" + reference`` and written to
    ``path + reference``; since the reference is appended directly, ``path``
    should end with a path separator.

    Args:
        path: Output directory prefix.
        url: Site root to read the references from and resolve them against.
    """
    with request.urlopen(url) as total_html:
        html_source = total_html.read().decode()

    jsHref = re.compile('<script src="(.*?)"', re.S).findall(html_source)
    cssHref = re.compile('<link rel="stylesheet" href="(.*?)"', re.S).findall(html_source)

    _save_text_assets(url, jsHref, path, "js")
    _save_text_assets(url, cssHref, path, "css")


def _save_text_assets(url, hrefs, path, label):
    """Fetch every entry of ``hrefs`` relative to ``url`` and store it under ``path``.

    ``label`` ("js" or "css") is only used in the progress messages. A failed
    download is reported and skipped so the remaining files are still tried.
    """
    for href in hrefs:
        name = href.split('/')[-1]
        try:
            with request.urlopen(url + "/" + href) as resp:
                source = resp.read().decode()
            filename = path + href
            # Create the parent directory of the target file if it is missing.
            os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
            with open(filename, 'w', encoding='UTF-8') as f:
                f.write(source)
            print(name + " " + label + "文件保存成功")
            # Pause between downloads so the server is not hit in a tight loop.
            time.sleep(1)
        except Exception:
            print("该" + name + " " + label + "文件无法下载")


# 保存img文件
def save_img(href, path):
    """Download every image in ``href`` from the site and save it into ``path``.

    Each entry is appended to ``http://sdk.g-bim.cn`` to form the download
    address, and the file is stored in ``path`` under the last ``/``-separated
    component of the entry. A failed download is reported and skipped.

    Args:
        href: Iterable of image paths as they appear in the page source.
        path: Existing directory that receives the image files.
    """
    for item in href:
        name = item.split('/')[-1]
        url = "http://sdk.g-bim.cn" + item
        # os.path.join picks the separator of the running platform; a literal
        # backslash would become part of the file name outside Windows.
        filename = os.path.join(path, name)
        try:
            with request.urlopen(url) as w:
                img_source = w.read()
            with open(filename, 'wb') as f:
                f.write(img_source)
            print(name + " 图像文件保存成功")
            # Pause between downloads so the server is not hit in a tight loop.
            time.sleep(1)
        except Exception:
            print("该" + name + " 图像无法下载")


if __name__ == '__main__':
    # Page to crawl, and the prefix that relative links are joined onto
    # (both are the site root here).
    url = 'http://sdk.g-bim.cn'
    baseurl = 'http://sdk.g-bim.cn'
    # NOTE: defined but never read below; pages are written under ../html_bak.
    basedir = r'C:\Users\Administrator\Desktop\HTML_bak'

    # Step 1: gather the address of every page linked from the navigation.
    urls = []
    get_urls(url, baseurl, urls)

    # Optional step, currently disabled: mirror the JS/CSS files as well via
    # save_css_js(r'../html_bak/').

    # Step 2: download each collected page.
    for page_url in urls:
        get_source(page_url, r'../html_bak')

期间有些问题

读取文件时出现 UnicodeDecodeError

UnicodeDecodeError: 'gbk' codec can't decode byte 0x89 in position 14: illegal multibyte sequence

open('order.log', 'r', encoding='UTF-8')：打开文件时尽量显式指定 UTF-8 编码。目前 HTML 的编码一般都是 UTF-8，这样可以避免乱码。

在 Python 环境下 Windows 和 Linux 路径分隔符的区别

path=r'../html_bak/'
 with open(path, 'w', encoding='UTF-8') as f:
 
 Windows 平台的路径分隔符是 \（在普通字符串里要写成 \\ 来转义），给字符串加上 r 前缀后就不用再手动转义。其实不用管平台差异，路径全部用 / 即可，打开文件时 Python 会自动处理。
 https://blog.csdn.net/qq_29831163/article/details/106263729

指定文件路径的父文件夹如果不存在则直接新建

os.makedirs(os.path.abspath(filename+os.path.sep+".."), exist_ok=True)

本文标题:用 Python 离线下载网页：以 GitBook 上的某个页面为例

文章作者:

发布时间:2021-06-23, 22:29:34

最后更新:2024-03-14, 22:55:29

原始链接:https://imlike.cc/1a58ae58.html

许可协议: "署名-非商用-相同方式共享 4.0" 转载请保留原文链接及作者。