在我的防爬虫程序中是这样子调用的:
# Feed the file(s) named by $LIST to "$PY_ACCESS G -gh" and post-process its
# output. awk skips the first two output lines and prints fields 1 and 3 of
# every row whose first field contains a digit and whose third field is
# non-empty. The loop then appends $ip to the file $GRAY whenever $num is
# greater than $FZ.
# NOTE(review): '"-"!~$2' uses field 2 as the regex and "-" as the subject;
# presumably '$2 !~ /-/' was intended -- confirm.
cat $LIST | \
$PY_ACCESS G -gh | \
awk '{if( NR>2 && $1~/[0-9]+/&&"-"!~$2 && $3!="" ) print $1,$3}' | \
while read num ip
do
if [[ $num -gt $FZ ]]
then
echo "$ip" >> $GRAY
else
continue
fi
done
其中 FZ 是判定异常访问量的阈值:某个 IP 的访问次数超过 FZ 时,该 IP 会被写入 $GRAY。
我的防爬虫脚本:http://blog.chinaunix.net/u3/102731/showart.php?id=2266335
python分析脚本的代码
#!/usr/bin/python
import sys, time, re
import getopt
# Help text for the command line. The script entry point prints it and exits
# with status 1 when the command argument is missing or is not one of
# F, G, T, FG, FT.
USAGE = """Usage: python access.py (F|G|T|FG|FT) [OPTION]... [FILE]
Analyze access.log of resin by FILTER, GROUP and TREND
If no FILE, read standard input.
Command description
F filter access log by one or more field. writing to standard output.
G group access log by field(s) and count lines or sum bytes.
T trend chart for access log, x axis is time, default unit is one
minute, y axis is numbers of line per unit or summary of bytes
per unit times.
FG group the filtered content.
FT trend the filtered content.
Arguemnts to filter.
-a regexp filter agent field use regular expression
-h regexp filter host field use regular expression
-r regexp filter request field use regular expression
-s regexp filter status field use regular expression
-t RANGE filter time field use RANGE (time range)
IMPORT: time filter is slow!
RANGE format is YYYYmmddHHMM, examples as follow:
20070815:20070816 is 200708150000 <= time <= 200708160000
20070815: is 200708150000 <= time
:20070816 is time <= 200708160000
200708151200:200708151315
is 200708151200 <= time <= 200708151315
Arguemnts to group.
-g FIELDS group fields, one character is one field, default is h
multi-fields is supoported.
eg: hRa is host+request_uri+agent
a - agent
h - host
r - request url
R - request uri
s - status
--byte group target summary bytes, default is --count
--count group target count request times, this is default
--top=n list top n group result, default is 10
Arguemnts to trend chart
-u unit time unit, default 60 seconds
--count same as group
--byte same as group
"""
class Request:
    """One parsed access-log line.

    Instances are built by :meth:`parse`, which stores the captured pieces of
    the line as string attributes: ``host``, ``time``, ``request``,
    ``request_uri``, ``status``, ``byte``, ``referer`` and ``agent``.
    """

    # host - - [time zone] "method request protocol" status bytes "referer" "agent"
    _LINE_RE = re.compile(
        r'(\S+) - - \[(\S+) \S+] "\S+ (\S+) \S+" (\d+) (\S+) "([^"]*)" "(.*)"')

    # One-letter field code -> attribute holding that field.
    _FIELD_ATTRS = {
        'h': 'host',
        'a': 'agent',
        'r': 'request',
        'R': 'request_uri',
        's': 'status',
    }

    def get(self, field):
        """Return the attribute selected by the one-letter code *field*.

        Codes: h=host, a=agent, r=request, R=request_uri, s=status.
        Any other value is returned unchanged.
        """
        attr = self._FIELD_ATTRS.get(field)
        if attr is None:
            return field
        return getattr(self, attr)

    def byte_int(self):
        """Return the byte field as an int, or 0 when it is not all digits.

        isdigit() is used rather than isalnum() so that a value containing
        letters yields 0 instead of making int() raise ValueError.
        """
        if self.byte.isdigit():
            return int(self.byte)
        return 0

    @staticmethod
    def parse(line):
        """Parse one log *line* into a Request.

        Returns None when the line does not match the expected layout; the
        unmatched line is echoed to standard output in that case.
        """
        found = Request._LINE_RE.match(line)
        if found is None:
            print(line)
            return None
        r = Request()
        (r.host, r.time, r.request, r.status,
         r.byte, r.referer, r.agent) = found.groups()
        # request_uri is the request up to (not including) the query string.
        # A '?' at index 0 is treated like "no query string".
        cut = r.request.find('?')
        if cut < 1:
            r.request_uri = r.request
        else:
            r.request_uri = r.request[:cut]
        return r
class TimeUtil:
    """Conversions between timestamp strings and epoch seconds.

    All conversions go through time.mktime / time.localtime, i.e. they are
    interpreted in the local time zone.
    """

    # Timestamp layout used in the log lines, e.g. 15/Aug/2007:12:00:01
    format = '%d/%b/%Y:%H:%M:%S'
    # Compact layout used for -t ranges: YYYYmmddHHMM
    format_i = '%Y%m%d%H%M'
    # Values substituted for digits missing at the end of a compact
    # timestamp: month and day default to 01, hour and minute to 00.
    _PAD = '000001010000'

    @staticmethod
    def str2time(date_str):
        """Return epoch seconds for a log-style timestamp string."""
        return time.mktime(time.strptime(date_str, TimeUtil.format))

    @staticmethod
    def str2time_i(date_str):
        """Return epoch seconds for a (possibly shortened) YYYYmmddHHMM string.

        Missing trailing positions are filled from _PAD. Padding with plain
        zeros would produce month/day "00" for inputs shorter than 8 digits,
        which time.strptime rejects. Inputs of 8 or more digits are padded
        exactly as before (with zeros).
        """
        # 200701010000
        # y   m d H M
        ds = date_str + TimeUtil._PAD[len(date_str):]
        return time.mktime(time.strptime(ds, TimeUtil.format_i))

    @staticmethod
    def time2str(second):
        """Format epoch seconds as a log-style timestamp string."""
        return time.strftime(TimeUtil.format, time.localtime(second))
class Filter:
    """Base class of the request filters; subclasses override valid()."""

    def valid(self, value):
        """Tell whether *value* passes the filter.

        The base implementation performs no check and returns None.
        """
        return None
class TimeFilter(Filter):
    """Accept requests whose timestamp lies inside a START:END range.

    START and END use the compact YYYYmmddHHMM layout understood by
    TimeUtil.str2time_i; either side may be empty for an open bound.
    """

    def __init__(self, v):
        """Parse the range string *v*.

        Raises ValueError when *v* contains no ':' separator.
        """
        bounds = v.split(':')
        if len(bounds) < 2:
            raise ValueError("time RANGE must contain ':' (got %r)" % (v,))
        start, end = bounds[0], bounds[1]
        if start == '':
            self.start = 0
        else:
            self.start = TimeUtil.str2time_i(start)
        if end == '':
            # Open upper bound. Converting a far-future placeholder such as
            # '9999' is not possible: zero padding turns it into month 00,
            # which time.strptime rejects.
            self.end = float('inf')
        else:
            self.end = TimeUtil.str2time_i(end)

    def valid(self, req):
        """Return True when req.time falls within [start, end]."""
        return self.start <= TimeUtil.str2time(req.time) <= self.end
class CompositeFilter(Filter):
    """A filter that passes only when every filter it holds passes."""

    def __init__(self):
        # Member filters, checked in the order they were added.
        self.items = []

    def add(self, filter):
        """Register one more member filter."""
        self.items.append(filter)

    def valid(self, value):
        """Return True if all members accept *value* (True when empty).

        Evaluation stops at the first member that rejects the value.
        """
        return all(member.valid(value) for member in self.items)
class RegexFilter(Filter):
    """Filter that searches one request field with a regular expression."""

    def __init__(self, p, f):
        """Compile pattern *p*; *f* is the field code handed to req.get()."""
        self.field = f
        self.pattern = re.compile(p)

    def valid(self, req):
        """Return the match object if the field matches, otherwise None."""
        text = req.get(self.field)
        return self.pattern.search(text)
class Grouper:
    """Accumulate a per-key count or byte sum over requests.

    The key of a request is built from the field codes in ``g`` (one
    character per field), joined with '|'.
    """

    def __init__(self, fields='h', count_or_byte=None):
        """Create a grouper.

        fields        -- field codes forming the key, passed to value.get().
        count_or_byte -- 'byte' to sum value.byte_int(); anything else counts
                         one per request.
        """
        self.g = fields
        # 0 = count requests, 1 = sum bytes
        self.count_or_byte = 0
        if 'byte' == count_or_byte:
            self.count_or_byte = 1
        self.map = {}
        self.total = 0
        self.top_size = 10

    def group(self, value):
        """Add one request to its group; falsy values are ignored."""
        if not value:
            return
        key = '|'.join([value.get(f) for f in self.g])
        if self.count_or_byte == 0:
            amount = 1
        else:
            amount = value.byte_int()
        # dict.get replaces has_key(), which no longer exists in Python 3.
        self.map[key] = self.map.get(key, 0) + amount
        self.total += amount

    def top(self):
        """Return the top_size largest (key, amount) pairs, largest first.

        sorted() with reverse=True is stable, so ties keep the dictionary's
        iteration order, as the former cmp-based sort did.
        """
        ranked = sorted(self.map.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:self.top_size]
class Trend:
    """Print per-interval totals (request count or byte sum) over time.

    Each finished interval is printed as "<interval start> <total>".
    Intervals without any request are printed with a total of 0.
    """

    def __init__(self):
        self.range = 60          # interval length in seconds
        self.start_time = 0
        self.count_or_byte = 0   # 0 = count requests, 1 = sum bytes
        # Interval state; populated by peek_start_time().
        self.cur_time = None
        self.limit = None
        self.total = 0

    def peek_start_time(self, value):
        """Start the first interval from *value*.

        The interval start is the first 17 characters of value.time with
        ':00' appended.
        """
        self.start_time = value.time[:17] + ':00'
        self.cur_time = self.start_time
        # Last second that still belongs to the current interval.
        self.limit = TimeUtil.str2time(self.start_time) + self.range - 1
        self.total = 0

    def process(self, value):
        """Account for one request, printing every interval it closes.

        Falsy values are ignored. If peek_start_time() has not been called
        yet, the first request starts the first interval instead of failing
        on uninitialised state.
        """
        if not value:
            return
        if self.limit is None:
            self.peek_start_time(value)
        # Parse the timestamp once rather than on every comparison.
        stamp = TimeUtil.str2time(value.time)
        if self.count_or_byte == 0:
            amount = 1
        else:
            amount = value.byte_int()
        if stamp <= self.limit:
            self.total += amount
            return
        print('%s %s' % (self.cur_time, self.total))
        self.limit += self.range
        # process the range have no record
        while stamp > self.limit:
            print('%s %s' % (TimeUtil.time2str(self.limit - (self.range - 1)), 0))
            self.limit += self.range
        self.cur_time = TimeUtil.time2str(self.limit - (self.range - 1))
        self.total = amount
if __name__ == '__main__':
    # Command-line driver: parse the command and options, build the filter /
    # grouper / trend objects, stream the log once and print the results.
    if len(sys.argv) < 2:
        print(USAGE)
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd not in ('F', 'G', 'T', 'FG', 'FT'):
        print(USAGE)
        sys.exit(1)
    # a t h r s g u
    try:
        opts, args = getopt.getopt(sys.argv[2:], "a:t:h:r:s:g:u:",
                                   ['count', 'byte', 'top='])
    except getopt.GetoptError as err:
        # Report a bad option instead of dying with a traceback.
        sys.stderr.write('%s\n' % err)
        print(USAGE)
        sys.exit(1)

    ## build filters
    timeFilter = None
    filters = CompositeFilter()
    if 'F' in cmd:
        for o, a in opts:
            if o in ('-a', '-h', '-r', '-s'):
                # o[1] is the field code understood by Request.get()
                filters.add(RegexFilter(a, o[1]))
                print('Filter: %s %s' % (o, a))
                continue
            # Plain equality: "o in ('-t')" was a substring test on a string.
            if o == '-t':
                timeFilter = TimeFilter(a)
                print('Filter: %s %s' % (o, a))

    ## build grouper
    grouper = Grouper()
    do_group = 'G' in cmd
    if do_group:
        for o, a in opts:
            if o == '-g':
                grouper.g = a
                print('Group Field: %s' % a)
            elif o == '--byte':
                grouper.count_or_byte = 1
            elif o == '--top':
                print('%s %s' % (o, a))
                grouper.top_size = int(a)

    ## build trend
    trend = Trend()
    do_trend = 'T' in cmd
    if do_trend:
        for o, a in opts:
            if o == '--byte':
                trend.count_or_byte = 1
            elif o == '-u':
                trend.range = int(a)

    ## process data
    if len(args) > 0:
        source = open(args[0])
    else:
        source = sys.stdin
    i = 0
    is_first = True
    in_time_range = False
    try:
        for line in source:
            text = line.strip()
            req = Request.parse(text)
            if not req:
                continue
            if timeFilter is not None:
                if timeFilter.valid(req):
                    in_time_range = True
                elif in_time_range:
                    # First line outside the range after matching lines were
                    # seen: stop reading.
                    # NOTE(review): relies on the log being in time order.
                    break
                else:
                    continue
            if filters.valid(req):
                if do_group:
                    grouper.group(req)
                if do_trend:
                    if is_first:
                        trend.peek_start_time(req)
                        is_first = False
                    trend.process(req)
                if (not do_trend) and (not do_group):
                    print(text)
            # NOTE(review): counts every parsed line that passed the time
            # filter, whether or not the other filters accepted it -- confirm
            # this is the intended placement.
            i += 1
            if do_group and i % 1000 == 0:
                # Progress counter, rewritten in place on the same line.
                sys.stdout.write('\b' * 100 + ' %d' % i)
    finally:
        # Close the log file even if processing fails.
        if source is not sys.stdin:
            source.close()

    if do_trend and not is_first:
        # Trend.process() prints an interval only when a later request closes
        # it, so the last interval has to be printed here.
        print('%s %s' % (trend.cur_time, trend.total))

    if do_group:
        # Terminate the progress line; emitted even when no progress was shown.
        sys.stdout.write('\n')
        ## print group result
        total = grouper.total
        for key, amount in grouper.top():
            # total can be 0 in byte mode when no line carried a byte count.
            if total:
                percent = round(amount * 100.0 / total, 4)
            else:
                percent = 0.0
            print("%10d %7.4f%% %s" % (amount, percent, key))
        print("%10d\t%s" % (total, 'TOTAL'))
因为 nginx 和 resin 的 access log 输出格式有些不同,所以获取 host、user agent 等字段的方式也不一样,差异主要体现在 def parse(line) 方法中。因此最好先把日志输出格式统一,再做分析。
|
文件: | access.zip |
大小: | 8KB |
下载: | 下载 |
|
压缩包里有三个文件:access.py 针对 resin;
ngx-access.py 和 ngx2-access.py 分别针对 nginx 0.6 和 0.7 版本,因为这两个版本默认的日志输出格式不同。