Downloading a web page for offline use with Python, taking a GitBook page as an example
The code first. This implementation draws on the following article:
https://blog.csdn.net/gorquanwu/article/details/81739589
```python
from urllib import request
from bs4 import BeautifulSoup as bs
import time
import os
import re

'''
Crawls a GitBook site and downloads its pages for offline use.
Features: deep URL crawling; saves each page's HTML plus the site's
CSS, JS and image files.
'''


def get_urls(url, baseurl, urls):
    # Collect links from the site's <nav> element, skipping plain anchors
    # and javascript pseudo-links, and build absolute child URLs.
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')
    links = bs(data, 'html.parser').find("nav").find_all('a')
    for i in links:
        suffix = i.get('href')
        if suffix == '#' or suffix == '#carousel-example-generic' or 'javascript:void(0)' in suffix:
            continue
        childurl = baseurl + "/" + suffix
        if childurl not in urls:
            urls.append(childurl)


def get_source(url, path):
    # Download one page's HTML and save it under `path`, mirroring the
    # URL's directory structure.
    try:
        with request.urlopen(url) as f:
            html_source = f.read().decode()
        timeStr = str(int(time.time()))
        pattern_title = '<title>(.*?)</title>'
        pattern_img = '<img src="(.*?)"'
        titleStr = re.compile(pattern_title, re.S).findall(html_source)[0]
        if '|' in titleStr:
            title = titleStr.split("|")[1].split(' ')[1] + timeStr
        else:
            title = titleStr + timeStr
        # Note: `title` and `imgHref` are computed but not used further below.
        imgHref = re.compile(pattern_img, re.S).findall(html_source)
        # Rebuild the local file path from the URL segments after the host.
        arrayurl = url.split('/')
        htmlFile = path
        for x in range(3, len(arrayurl)):
            htmlFile += "/" + arrayurl[x]
        # Create the parent directory if it does not exist yet.
        os.makedirs(os.path.abspath(htmlFile + os.path.sep + ".."), exist_ok=True)
        with open(htmlFile, 'w', encoding='UTF-8') as f:
            f.write(html_source)
        print(htmlFile + " HTML file saved")
        time.sleep(1)
    except Exception:
        print(url + " error while saving the HTML file")


def save_css_js(path):
    # Download the JS and CSS files referenced by the site's front page.
    url = "http://sdk.g-bim.cn"
    with request.urlopen(url) as total_html:
        html_source = total_html.read().decode()
    jsHref = re.compile('<script src="(.*?)"', re.S).findall(html_source)
    cssHref = re.compile('<link rel="stylesheet" href="(.*?)"', re.S).findall(html_source)
    for j in jsHref:
        try:
            with request.urlopen(url + "/" + j) as ww:
                js_source = ww.read().decode()
            filename = path + j
            os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
            with open(filename, 'w', encoding='UTF-8') as f:
                f.write(js_source)
            print(j.split('/')[-1] + " JS file saved")
            time.sleep(1)
        except Exception:
            print("JS file " + j.split('/')[-1] + " could not be downloaded")
    for k in cssHref:
        try:
            with request.urlopen(url + "/" + k) as vv:
                css_source = vv.read().decode()
            filename = path + k
            os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
            with open(filename, 'w', encoding='UTF-8') as f:
                f.write(css_source)
            print(k.split('/')[-1] + " CSS file saved")
            time.sleep(1)
        except Exception:
            print("CSS file " + k.split('/')[-1] + " could not be downloaded")


def save_img(href, path):
    # Download each image referenced by the page (binary mode, no decode).
    for i in range(0, len(href)):
        url = "http://sdk.g-bim.cn" + href[i]
        filename = path + '/' + href[i].split('/')[-1]
        try:
            with request.urlopen(url) as w:
                img_source = w.read()
            with open(filename, 'wb') as f:
                f.write(img_source)
            print(href[i].split('/')[-1] + " image saved")
            time.sleep(1)
        except Exception:
            print("image " + href[i].split('/')[-1] + " could not be downloaded")
            continue


if __name__ == '__main__':
    url = 'http://sdk.g-bim.cn'
    baseurl = 'http://sdk.g-bim.cn'
    basedir = r'C:\Users\Administrator\Desktop\HTML_bak'  # note: unused below
    urls = []
    get_urls(url, baseurl, urls)
    for u in urls:
        get_source(u, r'../html_bak')
```
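Note that save_css_js and save_img are defined but never invoked in the __main__ block, and get_source extracts imgHref without passing it on. A minimal sketch of wiring the CSS/JS download in; the call order and the reuse of ../html_bak are my assumptions, not something the original script does:

```python
if __name__ == '__main__':
    baseurl = 'http://sdk.g-bim.cn'
    basedir = r'../html_bak'
    urls = []
    get_urls(baseurl, baseurl, urls)
    save_css_js(basedir)   # assumption: fetch the shared CSS/JS once up front
    for u in urls:
        get_source(u, basedir)
    # save_img would additionally need get_source to return its imgHref list.
```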
Some problems encountered along the way
When reading a file, a UnicodeDecodeError appeared: UnicodeDecodeError: 'gbk' codec can't decode byte 0x89 in position 14: illegal
The fix is to specify the encoding explicitly, e.g. open('order.log', 'r', encoding='UTF-8'). Prefer UTF-8: most HTML today uses it, and naming it explicitly avoids garbled text.
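A minimal sketch of that pattern, plus a fallback decode; the filename order.log comes from the snippet above, and the gbk fallback is my assumption for files produced on Chinese-locale Windows systems:

```python
# Read with an explicit encoding; most HTML today is UTF-8.
with open('order.log', 'r', encoding='UTF-8') as f:
    text = f.read()

# Assumption: if the file may instead be gbk-encoded, read the raw
# bytes and decode with a fallback rather than letting open() guess.
with open('order.log', 'rb') as f:
    raw = f.read()
try:
    text = raw.decode('utf-8')
except UnicodeDecodeError:
    text = raw.decode('gbk', errors='replace')
```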
The difference between Windows and Linux path separators in Python:

```python
path = r'../html_bak/'
# On Windows the separator is '\', but you don't need to worry about it:
# just use '/' everywhere (a raw r'...' string also keeps backslashes
# literal), and Python manages the separator when opening files.
with open(path, 'w', encoding='UTF-8') as f:
    ...
```

Reference: https://blog.csdn.net/qq_29831163/article/details/106263729
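For completeness, a short sketch of portable ways to build paths; the paths here are illustrative:

```python
import os
from pathlib import Path

# Forward slashes work on both Windows and Linux when opening files.
path1 = '../html_bak/index.html'

# os.path.join inserts the platform's own separator.
path2 = os.path.join('..', 'html_bak', 'index.html')

# pathlib normalizes separators as well.
path3 = Path('..') / 'html_bak' / 'index.html'
```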
Creating the parent folder of a given file path if it does not already exist:

```python
os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
```
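The same effect can be achieved more directly with os.path.dirname or pathlib; these are equivalent alternatives, not what the script above uses, and the path is illustrative:

```python
import os
from pathlib import Path

filename = '../html_bak/gitbook/app.js'  # illustrative path

# The script's approach: append '..' to step up to the parent, then create it.
os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)

# Equivalent alternatives:
os.makedirs(os.path.dirname(filename), exist_ok=True)
Path(filename).parent.mkdir(parents=True, exist_ok=True)
```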