import os
import re
import time
from urllib import request

from bs4 import BeautifulSoup as bs
'''
Crawl a GitBook-style site and download its pages for offline use.
Features: depth crawl of URLs; save each page's css, html, js, and other files.
'''

def get_urls(url, baseurl, urls):
    """Collect the page links from `url`'s <nav> element into `urls`."""
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')
    nav = bs(data, 'html.parser').find('nav')
    if nav is None:  # page has no navigation sidebar
        return
    for link in nav.find_all('a'):
        suffix = link.get('href')
        # Skip empty anchors and javascript pseudo-links.
        if not suffix or suffix in ('#', '#carousel-example-generic') \
                or 'javascript:void(0)' in suffix:
            continue
        childurl = baseurl + "/" + suffix
        if childurl not in urls:
            urls.append(childurl)
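
# The docstring above promises a depth crawl, but get_urls only gathers the
# anchors from a single page's <nav>. A minimal recursive sketch of how a depth
# crawl could look (get_urls_deep and its depth parameter are assumptions, not
# part of the original script):
def get_urls_deep(url, baseurl, urls, depth=2):
    if depth == 0:
        return
    before = len(urls)
    try:
        get_urls(url, baseurl, urls)          # links found on this page
    except Exception:
        return                                # unreachable page: stop this branch
    for child in urls[before:]:               # recurse into the new links only
        get_urls_deep(child, baseurl, urls, depth - 1)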

def get_source(url, path):
    """Save the page at `url` under `path`, mirroring the URL path, and
    return the image URLs the page references (for save_img)."""
    try:
        with request.urlopen(url) as f:
            html_source = f.read().decode()
        # Rebuild the URL path below `path`, e.g.
        # http://sdk.g-bim.cn/docs/a.html -> path/docs/a.html
        arrayurl = url.split('/')
        htmlFile = path
        for x in range(3, len(arrayurl)):
            htmlFile += "/" + arrayurl[x]
        imgHref = re.compile('<img src="(.*?)"', re.S).findall(html_source)
        os.makedirs(os.path.dirname(htmlFile), exist_ok=True)
        with open(htmlFile, 'w', encoding='UTF-8') as f:
            f.write(html_source)
        print(htmlFile + " HTML file saved")
        time.sleep(1)  # throttle requests
        return imgHref
    except Exception:
        print(url + ": error while saving the HTML file")
        return []

def save_css_js(path):
    """Mirror the js/css assets referenced on the site's index page under `path`."""
    url = "http://sdk.g-bim.cn"
    with request.urlopen(url) as total_html:
        html_source = total_html.read().decode()
    jsHref = re.compile('<script src="(.*?)"', re.S).findall(html_source)
    cssHref = re.compile('<link rel="stylesheet" href="(.*?)"', re.S).findall(html_source)
    for kind, hrefs in (('js', jsHref), ('css', cssHref)):
        for asset in hrefs:
            try:
                # Tolerate hrefs with or without a leading slash.
                with request.urlopen(url + "/" + asset.lstrip('/')) as ww:
                    asset_source = ww.read().decode()
                filename = os.path.join(path, asset.lstrip('/'))
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, 'w', encoding='UTF-8') as f:
                    f.write(asset_source)
                print(asset.split('/')[-1] + " " + kind + " file saved")
                time.sleep(1)
            except Exception:
                print(asset.split('/')[-1] + " " + kind + " file could not be downloaded")

def save_img(href, path):
    """Download each image URL in `href` into the directory `path`."""
    for h in href:
        # Tolerate hrefs with or without a leading slash.
        url = "http://sdk.g-bim.cn/" + h.lstrip('/')
        # Images are written flat into `path` under their base file name.
        filename = os.path.join(path, h.split('/')[-1])
        try:
            with request.urlopen(url) as w:
                img_source = w.read()
            with open(filename, 'wb') as f:  # binary mode for image data
                f.write(img_source)
            print(h.split('/')[-1] + " image saved")
            time.sleep(1)
        except Exception:
            print(h.split('/')[-1] + " image could not be downloaded")
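
# save_css_js and save_img are defined above, but the entry point below never
# calls them, so the mirrored pages would load without styling or images. A
# sketch of one way to wire the three steps together (mirror_site and outdir
# are assumptions, not part of the original flow):
def mirror_site(baseurl='http://sdk.g-bim.cn', outdir=r'../html_bak'):
    urls = []
    get_urls(baseurl, baseurl, urls)
    save_css_js(outdir)                  # shared js/css assets, fetched once
    for u in urls:
        imgs = get_source(u, outdir)     # save the page, collect its <img> urls
        save_img(imgs, outdir)           # images land flat in outdir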

if __name__ == '__main__':
    url = 'http://sdk.g-bim.cn'
    baseurl = 'http://sdk.g-bim.cn'
    urls = []
    get_urls(url, baseurl, urls)
    for u in urls:
        get_source(u, r'../html_bak')
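
# Each discovered page is mirrored under ../html_bak with its URL path kept
# intact, e.g. http://sdk.g-bim.cn/docs/page.html -> ../html_bak/docs/page.html.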