import re
import urllib.request as ur
import time
import os
import threading
from urllib.error import URLError, HTTPError
folerpath = '169mm'
def gethtml(url):
try:
req = ur.Request(url)
except ValueError as e:
print('value Error',e.reason)
return
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
try:
response = ur.urlopen(req)
except URLError as e:
print('URLError reason:',e.reason)
return
try:
html = response.read()
except:
return
return html
'''保图片存到本地'''
def writeImgToFolder(hexData,subfolerpath):
with open(subfolerpath,'wb') as fp:
fp.write(hexData)
'''
将当前图片写真页面中的图片,保存到本地
http://www.169bb.com/gaogensiwa/2016/0808/36632.html
文件夹名称即为图片标题名
'''
def getImgSrcAndDownload(html,needchangeFolder,lastfolder,lastnum):
global folerpath
start = html.find('<title>')
end = html.find('</title>',start)
'''如果需要修改图片文件夹名称,则修改'''
'''否则用之前的名称'''
if (needchangeFolder):
imgtitle = html[start+len(r'<title>'):end]
subfolerpath = folerpath +"\\"+imgtitle
num = 0
else:
num = lastnum
subfolerpath = lastfolder
try:
os.mkdir(subfolerpath)
except:
pass
pat = re.compile(r'"center"><img src="')
iter1 = pat.finditer(html)
for i in iter1:
#print(i.group(),i.span())
tmp = i.group()
tail = html.find(' ',i.span()[1])
#print(html[i.span()[1]:tail-1])
theImgSrc = html[i.span()[1]:tail-1]
'''此处得到图片的字节集 '''
imghex = gethtml(theImgSrc)
imgPath = subfolerpath +"\\" + str(num) + '.jpg'
writeImgToFolder(imghex,imgPath)
num += 1
'''代表没找到图片,特征码不一样,改成查找center'''
if (num == lastnum):
pat = re.compile(r'"center"')
iter1 = pat.finditer(html)
for i in iter1:
tmp = i.group()
tail = html.find('jpg',i.span()[1])
theImgSrc = html[i.span()[1] + 3 + len('<img src=') + 2:tail+3]
'''此处得到图片的字节集'''
imghex = gethtml(theImgSrc)
imgPath = subfolerpath +"\\" + str(num) + '.jpg'
writeImgToFolder(imghex,imgPath)
num += 1
return (subfolerpath,num)
'''
得到第一层每一页中图片页面的地址
比如http://www.169bb.com/gaogensiwa/2016/0808/36632.html
http://www.169bb.com/gaogensiwa/2016/0808/36632_2.html
'''
def _getAllPageUrl(url):
subhtml = gethtml(url)
if (subhtml == None):
return
subhtml = subhtml.decode('GBK')
tup = getImgSrcAndDownload(subhtml,True,'',0)
lastFolder = tup[0]
lastnum = tup[1]
for j in range(2,6):
nextpage = url
nextpage = nextpage[0:len(nextpage)-5] + '_' +str(j)+'.html'
#print('nextpage:',nextpage)
subhtml = gethtml(nextpage)
if (subhtml == None):
continue
subhtml = subhtml.decode('GBK')
tup = getImgSrcAndDownload(subhtml,False,lastFolder,lastnum)
lastFolder = tup[0]
lastnum = tup[1]
time.sleep(0.1)
def getAllPageUrl(html):
pat = re.compile(r'http://www.169bb.com/gaogensiwa/\d{4}/\d{4}/\d{5}.html')
iter1 = pat.finditer(html)
thread_arr=[]
for i in iter1:
t = threading.Thread(target=_getAllPageUrl,args = (i.group(),))
thread_arr.append(t)
for i in thread_arr:
i.start()
for i in thread_arr:
i.join()
def main():
global folerpath
folerpath = os.getcwd()
folerpath += r'\169mm'
try:
os.mkdir(folerpath)
except:
pass
os.chdir(folerpath)
'''第一层需要遍历的页数'''
for i in range(1,3):
html = gethtml('http://www.169bb.com/gaogensiwa/list_3_%d.html'%i)
if (html == None):
continueo
html = html.decode('GBK')
getAllPageUrl(html)
if __name__=='__main__':
main()