昨天帮一个同学找北京到西安的火车票,突然想到旷同学正好写了一个,但是需要指定火车票的日期,于是就在上面稍微改了下,并增加了打印链接地址。
checkhcp.py:
#!/usr/bin/python
# encoding: utf-8
import urllib2
import urlparse
import mailbox
import os
import sys
import re
from sgmllib import SGMLParser
import time
class URLListName(SGMLParser):
is_a=""
name=[]
link={}
def start_a(self, attrs):
self.is_a=1
self.attrs = attrs
def end_a(self):
self.is_a=""
def handle_data(self, text):
ret = text.find('发车日期')
if self.is_a and ret != -1:
for atname,atvalue in self.attrs:
if atname == 'href':
self.link[text] = atvalue
# Command-line entry point: poll a ticket listing page and print every newly
# seen entry whose title contains the requested keyword.

# The four positional arguments are mandatory; the keyword is optional.
if len(sys.argv) < 5:
    mesg = """
用法: python checkhcp.py 始发站 终点站 月 日 关键字
例如,查找2月10号北京到西安的卧铺票:
# python checkhcp.py beijing xian 2 10 卧铺
注:始发站和终点站必须用拼音,关键字可以是车次,车票类型,车票张数等。
"""
    print(mesg)
    sys.exit(1)

start, end, month, day = sys.argv[1:5]
# An empty keyword matches every entry. Accept the keyword whenever it is
# present instead of only when there are exactly six argv entries.
key = sys.argv[5] if len(sys.argv) >= 6 else ''

# NOTE(review): this URL has no scheme or host, so urllib2 cannot open it as
# written; the site prefix looks to have been lost from this listing --
# prepend the real base URL before use.
url = "%s-%s/?sort=d1&Cat1=%s月&Cat3=%s日&page=1" % (start, end, month, day)
sequence = 60  # seconds to sleep between two polls

request = urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/5.0')
opener = urllib2.build_opener()

ticket_name = URLListName()
cache = set()  # titles that have already been reported

# Poll forever; any network or parsing error propagates and ends the script.
while True:
    print("beign retrive")
    response = opener.open(request)
    try:
        data = response.read()
    finally:
        response.close()

    # Start from a clean parser state for each document and flush any
    # buffered trailing markup once the whole page has been fed.
    ticket_name.reset()
    ticket_name.feed(data)
    ticket_name.close()

    print("beign scan")
    # Iterate the collected titles directly; copying them into a list on
    # every poll would make that list grow without bound.
    for result in list(ticket_name.link):
        if not result or key not in result or result in cache:
            continue
        print('\033[91m' + "found:" + result + '\033[0m')
        print(ticket_name.link[result])
        cache.add(result)

    print("scan finished, begin sleep " + str(sequence) + " seconds.")
    time.sleep(sequence)
|
阅读(2957) | 评论(0) | 转发(0) |