The Python program is done. Put in your blog address, run it, and it will automatically download all the articles and images from your blog. I wrote it mainly because of tinypic.com: that image-hosting site is shutting down this year, so I wanted to save all the images locally. If you know how to run Python programs, please give it a try; I would like to hear your suggestions for improvement.
""" Run the followings in CMD window: make a new directory for blog pages chcp 936 if no Chinese python wenxuecity2.py """ import urllib2, re, os # *** Modify url1 to be your wenxuecity blog *** #url1="http://blog.wenxuecity.com/myindex/33408/" #dingzhuang url1="http://blog.wenxuecity.com/myindex/24769/" #xiuyuan #url1="https://blog.wenxuecity.com/myoverview/26805/" #test #same images for page def saveHtml(page, _link, time): link = re.search('http.+html', _link).group() f1 = urllib2.urlopen(link) s1 = f1.read() #title = re.search('(?).+(?=).+?(?=)', s1, re.DOTALL).group().strip() #print(content.decode("utf8") ) img = re.findall('', content, re.IGNORECASE) cnt = 1 for x in img: src = re.findall('http.+jpg', x, re.IGNORECASE) if len(src) == 0: continue des = "images/P%04dI%03d.jpg" % (page, cnt) try: f1 = urllib2.urlopen(src[0]) f2 = open(des, "wb") f2.write(f1.read()) f2.close() except: pass content = content.replace(src[0], des) cnt += 1 f2 = open("P%04d.htm" % page, "wb") f2.write(_link + " " + time + "n" + content + "n") f2.close() # process pages def getPage(_page, url1): id = re.findall("[0-9]+", url1)[0] url2="http://blog.wenxuecity.com/blog/frontend.php?page=0&act=articleList&blogId=" url2 = url2.replace("0", str(_page))+id f1 = urllib2.urlopen(url2) s1 = f1.read().split("n") count = 0 for i in range(len(s1)): if s1[i].find('class="atc_title"')>0: link = s1[i+1].strip()+s1[i+2].strip()+s1[i+3].strip() count = count+1 if(count > 0): print _page*60+count, link.decode("utf8") time = re.search('(?).+(?=