# 做题+继续做题+还是做题=通过二级 | python脚本 | 作者:飞 出处:3 添加时间:2008-1-26 16:27:13
# coding=utf-8
from HTMLParser import HTMLParser
import urllib2
def strip_tags(html):
    """Return the text content of html with every tag removed.

    Surrounding whitespace is trimmed from html before parsing; the text
    pieces reported by the parser are concatenated in document order.

    html -- markup string to reduce to plain text.
    """
    # A bare strip() already trims newlines along with other whitespace,
    # so no separate strip("\n") pass is needed.
    html = html.strip()
    chunks = []
    parser = HTMLParser()
    # Route every text node straight into the list instead of subclassing.
    parser.handle_data = chunks.append
    parser.feed(html)
    parser.close()
    return ''.join(chunks)
#print strip_tags()
#f=open('t.htm')
#kk=f.read()
#f.close()
def prc_data(kk):
    """Extract the listing from one downloaded page and append it to result.txt.

    The part of the page after the first "流行度" (popularity) marker is
    reduced to plain text with strip_tags(), cut at the first "页" (page)
    marker, cleaned of spaces and of the "后一"/"前一" (next/previous)
    fragments, and split into records wherever three line breaks follow
    each other.  Each record is written to result.txt on its own line.

    kk -- raw HTML of a listing page.

    Raises IndexError when the "流行度" marker is missing from kk.
    """
    sections = kk.split("流行度")
    # NOTE(review): the quoted literal on this line was broken across two
    # lines in the text this script was recovered from.  '<br>' is a
    # reconstruction (a literal newline here would remove every '\n' that
    # the record splitting below depends on) -- confirm against the
    # original script.
    listing = sections[1].replace('<br>', '')
    listing = strip_tags(listing)
    text = listing.strip().split("页")[0]
    text = text.strip()
    text = text.replace(' ', '')
    text = text.replace('后一', '')
    text = text.replace('前一', '')
    # Turn line breaks into '~' so that a run of three marks a record gap.
    text = text.replace('\n', '~')
    records = text.split('~~~')
    # The with-block closes the file even if a write fails part-way.
    with open('result.txt', 'a') as out:
        for record in records:
            out.write(record + '\n')
def getdata(vurl):
    """Download vurl and return the raw response body.

    vurl -- URL to fetch with urllib2.urlopen().

    Any error raised while opening or reading the URL propagates to the
    caller.
    """
    response = urllib2.urlopen(vurl)
    try:
        return response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
def hh(dd):
    """Append the string dd to name.txt in the current directory.

    dd -- text to append; it is written as-is, without a trailing newline.
    """
    # The with-block closes the file even if the write fails.
    with open('name.txt', 'a+') as out:
        out.write(dd)
def spider():
    """Crawl the A-Z listing pages and pass each one to prc_data().

    For every capital letter the first listing page is downloaded and
    processed, and its URL is printed as a curl command line.  Follow-up
    pages 2..19 (URL suffix "-p-N") are then tried in order until one of
    them fails to download or to parse.

    A failure on the first page of a letter is not caught and aborts the
    crawl.
    """
    # Alternative listing: "http://www.yingwenming.com/boy/boy-names-*.htm"
    aurl = "http://www.yingwenming.com/girl/girl-names-*.htm"
    for code in range(65, 91):  # ord('A') .. ord('Z')
        letter = chr(code)
        durl = aurl.replace('*', letter)
        prc_data(getdata(durl))
        print('curl ' + durl + '>' + letter + '.htm')
        for page in range(2, 20):
            durl = aurl.replace('*', letter + '-p-%s' % page)
            print(durl)
            try:
                prc_data(getdata(durl))
            except Exception:
                # A failed download or parse is taken to mean there are no
                # more pages for this letter.  Catching Exception rather
                # than everything lets Ctrl-C still stop the crawl.
                break
# Run the crawl when this file is executed.
spider()
from HTMLParser import HTMLParser
import urllib2
def strip_tags(html):
    """Return the text content of html with every tag removed.

    Surrounding whitespace is trimmed from html before parsing; the text
    pieces reported by the parser are concatenated in document order.

    html -- markup string to reduce to plain text.
    """
    # A bare strip() already trims newlines along with other whitespace,
    # so no separate strip("\n") pass is needed.
    html = html.strip()
    chunks = []
    parser = HTMLParser()
    # Route every text node straight into the list instead of subclassing.
    parser.handle_data = chunks.append
    parser.feed(html)
    parser.close()
    return ''.join(chunks)
#print strip_tags()
#f=open('t.htm')
#kk=f.read()
#f.close()
def prc_data(kk):
    """Extract the listing from one downloaded page and append it to result.txt.

    The part of the page after the first "流行度" (popularity) marker is
    reduced to plain text with strip_tags(), cut at the first "页" (page)
    marker, cleaned of spaces and of the "后一"/"前一" (next/previous)
    fragments, and split into records wherever three line breaks follow
    each other.  Each record is written to result.txt on its own line.

    kk -- raw HTML of a listing page.

    Raises IndexError when the "流行度" marker is missing from kk.
    """
    sections = kk.split("流行度")
    # NOTE(review): the quoted literal on this line was broken across two
    # lines (and fused with the next statement) in the text this script
    # was recovered from.  '<br>' is a reconstruction (a literal newline
    # here would remove every '\n' that the record splitting below depends
    # on) -- confirm against the original script.
    listing = sections[1].replace('<br>', '')
    listing = strip_tags(listing)
    text = listing.strip().split("页")[0]
    text = text.strip()
    text = text.replace(' ', '')
    text = text.replace('后一', '')
    text = text.replace('前一', '')
    # Turn line breaks into '~' so that a run of three marks a record gap.
    text = text.replace('\n', '~')
    records = text.split('~~~')
    # The with-block closes the file even if a write fails part-way.
    with open('result.txt', 'a') as out:
        for record in records:
            out.write(record + '\n')
def getdata(vurl):
    """Download vurl and return the raw response body.

    vurl -- URL to fetch with urllib2.urlopen().

    Any error raised while opening or reading the URL propagates to the
    caller.
    """
    response = urllib2.urlopen(vurl)
    try:
        return response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
def hh(dd):
    """Append the string dd to name.txt in the current directory.

    dd -- text to append; it is written as-is, without a trailing newline.
    """
    # The with-block closes the file even if the write fails.
    with open('name.txt', 'a+') as out:
        out.write(dd)
def spider():
    """Crawl the A-Z listing pages and pass each one to prc_data().

    For every capital letter the first listing page is downloaded and
    processed, and its URL is printed as a curl command line.  Follow-up
    pages 2..19 (URL suffix "-p-N") are then tried in order until one of
    them fails to download or to parse.

    A failure on the first page of a letter is not caught and aborts the
    crawl.
    """
    # Alternative listing: "http://www.yingwenming.com/boy/boy-names-*.htm"
    aurl = "http://www.yingwenming.com/girl/girl-names-*.htm"
    for code in range(65, 91):  # ord('A') .. ord('Z')
        letter = chr(code)
        durl = aurl.replace('*', letter)
        prc_data(getdata(durl))
        print('curl ' + durl + '>' + letter + '.htm')
        for page in range(2, 20):
            durl = aurl.replace('*', letter + '-p-%s' % page)
            print(durl)
            try:
                prc_data(getdata(durl))
            except Exception:
                # A failed download or parse is taken to mean there are no
                # more pages for this letter.  Catching Exception rather
                # than everything lets Ctrl-C still stop the crawl.
                break
# Run the crawl when this file is executed.
spider()
# > 浏览所有新闻

