评书吧批量下载脚本
通常只要修改Page_name和total_pages即可实现自动换页批量下载,喜欢听书的朋友可以试试。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | #coding=utf8 # Author: Ting Li 2010-08 import httplib, sys, time import HTMLParser global download_links download_links=[] # request the page def GetUrl(ServerAdr,PagePath, Page): # print ServerAdr,PagePath conn = httplib.HTTPConnection(ServerAdr) conn.request('GET',PagePath+Page) # print conn # conn.putrequest('GET', PagePath) # conn.putheader('Accept', 'text/html') # conn.endheaders() # conn.send("") res = conn.getresponse() # print "code=", res.status if res.status != 200: raise Exception("Could not get document: Check URL and Path. " ) data = res.read() conn.close() # data=unicode(data, 'gb2312') return data #parse the page and return the part between the start and end token def ExtractData(in_string, start_line, end_line): lstr=in_string.splitlines() #split j=0 #set counter to zero slice_start=0 slice_end=0 for i in lstr: if start_line in i.strip(): slice_start=j #find slice start elif end_line in i.strip(): slice_end=j+1 #find slice end j=j+1 return lstr[slice_start:slice_end] #return slice class MyHTMLParser(HTMLParser.HTMLParser): global download_links def __init__(self,basename,filter): HTMLParser.HTMLParser.__init__(self) self.basename=basename self.filter=filter self.flag=False self.url='' def handle_data(self,data): # pass tag = self.get_starttag_text() # if data.strip() and ('href' in tag): print data if self.flag: if data.strip(): # print 'tag: ', tag # print 'data: ',data print data else: print self.url def handle_starttag(self,tag,attrs): if tag=='a': for n,v in attrs: if n=='href' and (self.filter in v): self.flag=True # print 'myparser: ',self.get_starttag_text() #print the url in plain text # print self.basename+v #print the url in html b=self.basename if ('../' in v) and (self.basename.count('/')>1): b='/'.join(self.basename.rsplit('/')[:-2]) v=v.replace('../','/') print '<a href="'+b+v+'">' + 'test'#+ \ # self.basename+v self.url=b+v # print self.url download_links.append(b+v) def handle_endtag(self,tag): if tag=='a' and self.flag: self.flag=False print '</a><br />' def ParseHyperlinks(html,base,filter): mp = MyHTMLParser(base,filter) mp.feed(html) #handle the returned stuff and generate a new page def main(): global download_links # parameter and constants ServerAdr='www.pingshu8.com' PagePath='/MusicList/' filter='down_' StartLine='<body>' EndLine='</body>' coding='gb2312' Head1='<html><head>' meta='<meta http-equiv="Content-Type" content="text/html; charset=%s" />' \ % coding Head2='</head><body>' Foot='</body></html>' count=0 delay = 30 #in seconds start_page=2 start_file=9 #指定要下载的页面(需根据节目页面修改),一般只需要该这个模板即可 Page_name='mmc_239_958_%s.Htm' #loop over all pages automatically #指定页面总数 total_pages=3 page_range=range(1,total_pages+1) for ip,ipage in enumerate(page_range): if ip+1<start_page: continue Page=Page_name % ipage print 'Handling page %s ... ' % Page count+=1 print "ipage=",ipage #sleep a while to prevent being kick off by the website if ipage % 5 == 0: time.sleep(delay) download_links=[] # print "link1: ",download_links # call functions RawData=GetUrl(ServerAdr, PagePath,Page) # RawData=RawData.replace('&','and') #预处理 newdata='' v=ExtractData(RawData, StartLine, EndLine) # print v for i in v: # print i.strip() if 'list4' in i: newdata=newdata+i.strip()+'\n' # print newdata # HTMLParser.HTMLParser().feed(RawData) # sys.exit(1) print Head1+meta+Head2 ParseHyperlinks(newdata,'http://'+ServerAdr+PagePath, filter) print Foot #now open the page in the next level dlist=[] # print "link2",download_links # break for i in download_links: print i #split out the basename page=i.split('http://'+ServerAdr)[1] # print page htm=GetUrl(ServerAdr, page,'') # print htm #find out the link containing the download link the gif for l in htm.split(): # print l if ('.mp3?' in l): # print l b=l.find('href="')+6 str=l[b:] # print str e=str.find('"') url=str[:e] # print url #parse the name of the mp3 file e=url.rfind('?') b=url.rfind('/')+1 fname=url[b:e] # print fname dlist.append([fname,url]) print dlist #start the actual downloading f=open('download_list.txt','w') for i in dlist: print >> f, i[0], i[1] f.close() import os,urllib for id,i in enumerate(dlist): if id+1<start_file: continue print "downloading %s ..." % i[0] #using wget os.system('wget -c -O '+i[0]+' '+i[1]) #using urllib # urllib.urlretrieve(i[1],i[0]) start_file=1 #call main function #print "你好!" main() |
友情支持~!支持