用 Python 下载网页供离线浏览——以 GitBook 上的某个页面为例

文章目录
  1. 1. 读取文件时出现 UnicodeDecodeError
  2. 2. 在 Python 环境下 Windows 和 Linux 路径分隔符的区别
  3. 3. 指定文件路径的父文件夹如果不存在则直接新建

先上代码。本代码参考了下面这篇文章来实现：

https://blog.csdn.net/gorquanwu/article/details/81739589

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# author: leek
# date:2021-6-23
from urllib import request
from bs4 import BeautifulSoup as bs
import time
import os
import re

'''
用来爬取网站网页 gitbook 页面 离线下载到本地
实现功能:url深度抓取,保存每个页面的css、html、js等文件
'''


def get_urls(url, baseurl, urls):
    """Collect the child-page URLs linked from the ``<nav>`` element of a page.

    Fetches ``url``, decodes the response as UTF-8 and walks every ``<a>`` tag
    inside the first ``<nav>`` element. Each accepted ``href`` is appended to
    ``baseurl`` (joined with ``/``) and stored in ``urls`` unless that exact
    URL is already present.

    Links are skipped when they have no ``href`` at all, when the ``href`` is
    ``#`` or ``#carousel-example-generic``, or when it contains
    ``javascript:void(0)``. If the page has no ``<nav>`` element, ``urls`` is
    left unchanged.

    Args:
        url: Address of the page whose navigation is scanned.
        baseurl: Prefix placed in front of every ``href`` that is kept.
        urls: List extended in place with the discovered URLs.

    Returns:
        None. Results are delivered through ``urls``.
    """
    with request.urlopen(url) as response:
        data = response.read().decode('utf-8')

    # Name the parser explicitly so parsing does not depend on which optional
    # parser packages happen to be installed.
    nav = bs(data, 'html.parser').find("nav")
    if nav is None:
        return

    for anchor in nav.find_all('a'):
        suffix = anchor.get('href')
        # An <a> without an href yields None, which cannot be searched with
        # ``in``; treat it like the other non-navigational links.
        if suffix is None:
            continue
        if suffix in ('#', '#carousel-example-generic') or 'javascript:void(0)' in suffix:
            continue
        childurl = baseurl + "/" + suffix
        if childurl not in urls:
            urls.append(childurl)

def get_source(url, path, *, delay=1):
    """Download one page and save it under ``path``, mirroring the URL path.

    The local file name is ``path`` followed by every ``/``-separated segment
    of ``url`` from index 3 of ``url.split('/')`` onwards, i.e. everything
    after ``scheme://host``. A URL with no such segment, or whose last segment
    is empty (it ends in ``/``), is stored as ``index.html`` in the
    corresponding directory. Missing parent directories are created.

    Any failure is reported on stdout and swallowed so that a crawl over many
    pages keeps going.

    Args:
        url: Absolute URL of the page to download.
        path: Directory under which the page is saved.
        delay: Seconds to sleep after a successful save (keyword-only).

    Returns:
        None.
    """
    try:
        with request.urlopen(url) as response:
            html_source = response.read().decode()

        segments = url.split('/')[3:]
        if not segments or not segments[-1]:
            # A directory-style URL has no file name of its own; without one,
            # open() would be asked to write to a directory and fail.
            segments = segments[:-1] + ['index.html']
        html_file = "/".join([path] + segments)

        # Create the parent directory of the target file if it is missing.
        os.makedirs(os.path.dirname(os.path.abspath(html_file)), exist_ok=True)
        with open(html_file, 'w', encoding='UTF-8') as out:
            out.write(html_source)
        print(html_file + "文件保存成功")
        time.sleep(delay)
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C still stops the crawl.
        print(url + "保存html文件时报错", exc)

def save_css_js(path, *, base_url="http://sdk.g-bim.cn", delay=1):
    """Download the scripts and stylesheets referenced by a page.

    Fetches ``base_url``, finds every ``<script src="...">`` and
    ``<link rel="stylesheet" href="...">`` reference with a regular
    expression, downloads each one and writes it as UTF-8 text below ``path``,
    keeping the path component of the reference as the relative file name.
    A reference that cannot be downloaded or written is reported on stdout
    and skipped.

    Args:
        path: Directory under which the assets are saved.
        base_url: Page whose asset references are downloaded (keyword-only).
        delay: Seconds to sleep after each saved file (keyword-only).

    Returns:
        None.
    """
    with request.urlopen(base_url) as response:
        html_source = response.read().decode()

    js_refs = re.findall('<script src="(.*?)"', html_source, re.S)
    css_refs = re.findall('<link rel="stylesheet" href="(.*?)"', html_source, re.S)

    _save_text_assets(base_url, path, js_refs, "js", delay)
    _save_text_assets(base_url, path, css_refs, "css", delay)


def _save_text_assets(base_url, path, refs, label, delay):
    """Fetch each reference in ``refs`` and write it below ``path`` as UTF-8 text."""
    from urllib.parse import urljoin, urlparse

    for ref in refs:
        name = ref.split('/')[-1]
        try:
            # urljoin resolves relative, root-relative and absolute references
            # against the page they were found on.
            with request.urlopen(urljoin(base_url, ref)) as response:
                text = response.read().decode()
            # Only the path component names the local file, so a query string
            # or a scheme/host prefix never ends up in the file name; joining
            # with os.path.join works whether or not ``path`` ends in a
            # separator.
            filename = os.path.join(path, urlparse(ref).path.lstrip('/'))
            os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
            with open(filename, 'w', encoding='UTF-8') as out:
                out.write(text)
            print(name + " " + label + "文件保存成功")
            time.sleep(delay)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still stops the run.
            print("该" + name + " " + label + "文件无法下载", exc)


def save_img(href, path, *, base_url="http://sdk.g-bim.cn", delay=1):
    """Download images and save them directly inside ``path``.

    Each entry of ``href`` is appended to ``base_url`` to form the download
    URL. The file is stored in ``path`` under the last ``/``-separated
    component of the entry. ``path`` is created if it is missing. An image
    that cannot be downloaded or written is reported on stdout and skipped.

    Args:
        href: Sequence of image references, each appended to ``base_url``.
        path: Directory in which the images are saved.
        base_url: Prefix placed in front of every reference (keyword-only).
        delay: Seconds to sleep after each saved image (keyword-only).

    Returns:
        None.
    """
    for ref in href:
        name = ref.split('/')[-1]
        # os.path.join picks the separator of the running platform; a literal
        # backslash would become part of the file name outside Windows.
        filename = os.path.join(path, name)
        try:
            with request.urlopen(base_url + ref) as response:
                img_source = response.read()
            os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
            with open(filename, 'wb') as out:
                out.write(img_source)
            print(name + " 图像文件保存成功")
            time.sleep(delay)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still stops the run.
            print("该" + name + " 图像无法下载", exc)


if __name__ == '__main__':
    # Prefix used to turn the hrefs found in the navigation into full URLs.
    baseurl = 'http://sdk.g-bim.cn'
    # Page whose navigation is crawled.
    url = 'http://sdk.g-bim.cn'
    # Save location; assigned here but not referenced by the calls below.
    basedir = r'C:\Users\Administrator\Desktop\HTML_bak'

    # get_urls fills this list in place with every page address it finds.
    urls = []
    get_urls(url, baseurl, urls)

    # Asset download is currently disabled:
    # save_css_js(r'../html_bak/')

    # Download every discovered page into the local mirror directory.
    for page_url in urls:
        get_source(page_url, r'../html_bak')

期间遇到了一些问题，记录如下：

读取文件时出现 UnicodeDecodeError

UnicodeDecodeError: 'gbk' codec can't decode byte 0x89 in position 14: illegal multibyte sequence

解决办法：open('order.log', 'r', encoding='UTF-8')。打开文件时尽量显式指定 UTF-8 编码：从上面的报错可以看出，这里 open() 默认使用的是 gbk 编码，而目前 HTML 文件一般都是 UTF-8 编码，显式指定可以避免解码报错和乱码。

在 Python 环境下 Windows 和 Linux 路径分隔符的区别

1
2
3
4
5
6
path=r'../html_bak/'
with open(path, 'w', encoding='UTF-8') as f:

Windows 平台的路径分隔符是反斜杠 \，在普通字符串里要写成 \\；在字符串前加上 r（原始字符串）之后就不需要再转义。其实不用管这些，路径全部用 / 即可，Python 在打开文件时会自动处理。
https://blog.csdn.net/qq_29831163/article/details/106263729

指定文件路径的父文件夹如果不存在则直接新建

1
os.makedirs(os.path.abspath(filename+os.path.sep+".."), exist_ok=True)