import sys
from BeautifulSoup import BeautifulSoup
import re
import urllib2
import csv
def get_name_price(url):
    """Fetch one search-result page and extract its listings and last-page link.

    The page is downloaded with ``urllib2`` and parsed by BeautifulSoup as
    GBK-encoded HTML.  A progress message is printed to stdout after each step.

    Args:
        url: Address of the result page to download.

    Returns:
        A ``(name, price, lasturl)`` tuple holding every ``<div class="name">``
        element, every ``<span class="price_type">`` element, and the ``href``
        of the pager entry whose text is u'\\u5c3e\\u9875'.  ``lasturl`` is
        ``url`` itself when the page has no such entry (or no pager at all).
    """
    print(url)
    response = urllib2.urlopen(url)
    try:
        print("get response")
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    soup = BeautifulSoup(html, fromEncoding="gbk")
    print("soup complete")
    name = soup.findAll("div", {"class": "name"})
    print("get name")
    price = soup.findAll("span", {"class": "price_type"})
    print("get price")

    last_page_label = u'\u5c3e\u9875'
    lasturl = ""
    pagers = soup.findAll("div", {"class": "searchListPage"})
    # A page without a pager block is treated like a single-page result
    # instead of failing on an empty result list.
    children = pagers[0].contents if pagers else []
    # Only the odd-indexed children are inspected; presumably the pager
    # entries alternate with whitespace text nodes -- TODO confirm.
    for node in children[1::2]:
        try:
            css_class = node['class']
        except (KeyError, TypeError):
            # Text nodes and tags without a class attribute are not pager
            # entries, so skip them rather than abort the scan.
            continue
        if css_class != u's4' or not node.contents:
            continue
        link = node.contents[0]
        if link.string == last_page_label:
            lasturl = link["href"]
            break
    if lasturl == "":
        lasturl = url
    return (name, price, lasturl)
if len(sys.argv) < 2:
url="%CE%AB%B7%BB______%D7%A1%D5%AC___________1.htm"
else:
url=sys.argv[1]
l=len(url)
ptn=url[0:l-5]
(name,price,lasturl)=get_name_price(url)
if url!=lasturl:
lasturl=""+lasturl
allname=name
allprice=price
lptn=len(ptn)
cnt=int(lasturl[lptn:len(lasturl)-4])
f=open("house_price.xls","w")
alllen=len(name)
for i in range(0,alllen):
print name[i].contents[1].string,price[i].string,name[i].contents[1]['href']
f.write(name[i].contents[1].string.encode("gbk"))
f.write(" ")
f.write(price[i].string.encode("gbk"))
f.write(" ")
f.write(name[i].contents[1]['href']+"\r\n")
f.flush()
print '-------------------------------'
for i in range(2,cnt+1):
url=ptn+str(i)+".htm"
(name,price,tem)=get_name_price(url)
allname+=name;
allprice+=price
alllen=len(name)
for i in range(0,alllen):
print name[i].contents[1].string,price[i].string,name[i].contents[1]['href']
f.write(name[i].contents[1].string.encode("gbk"))
f.write(" ")
f.write(price[i].string.encode("gbk"))
f.write(" ")
f.write(name[i].contents[1]['href']+"\r\n")
f.flush();
print '-------------------------------'
|