Chinaunix首页 | 论坛 | 博客
  • 博客访问: 178714
  • 博文数量: 32
  • 博客积分: 553
  • 博客等级: 中士
  • 技术积分: 369
  • 用 户 组: 普通用户
  • 注册时间: 2006-06-17 18:18
文章分类

全部博文(32)

文章存档

2015年(1)

2014年(1)

2013年(9)

2012年(9)

2011年(12)

分类: 系统运维

2011-07-11 06:16:05

有效去除list中的重复值
2008-02-27 01:57
最近写一个小工具,顺便开始学用Python。觉得挺有趣(虽然OOP一直很让我头疼)

恰好昨天遇到一个问题,一个list中有许多重复的值,需要将它去重。找了半天,list没有相关的类似uniq这样的方法。于是上网找……结果找到一堆帖子都唆使大家用set。但是在我的机器上试了一下,还要import sets,否则……:

Traceback (most recent call last):
File "", line 1, in
NameError: name 'Set' is not defined

但是我已经import了一堆东西,并且这个方式在效率上不怎么样,觉得很不爽。于是继续找……终于,在某个牛人的站点上找到一篇,解决了困扰我至少15分钟的问题……

看了看文中提到的11种去重方式,选了一个最快的。

def f9(seq):
# Not order preserving
return {}.fromkeys(seq).keys()不需要import任何模块,直接用dictionary 内建的fromkeys和keys解决问题。很好,很强大。

上面提到的文章中一共给出了11种去重的方法,并且都以函数的方式定义了,直接粘出来就可以用(或者作为一个模块import之)。代码可以

为了避免以后找不到,我还是在这里粘出来吧……顺便再牢骚一下space没有代码高亮功能的问题…… -_-!

  1. from random import shuffle, randint
  2. import re
  3. from sets import Set

  4. def f1(seq): # Raymond Hettinger
  5.     # not order preserving
  6.     set = {}
  7.     map(set.__setitem__, seq, [])
  8.     return set.keys()

  9.     
  10. def f2(seq): # *********
  11.     # order preserving
  12.     checked = []
  13.     for e in seq:
  14.         if e not in checked:
  15.             checked.append(e)
  16.     return checked

  17. def f3(seq):
  18.     # Not order preserving
  19.     keys = {}
  20.     for e in seq:
  21.         keys[e] = 1
  22.     return keys.keys()

  23. def f4(seq): # ********** order preserving
  24.     noDupes = []
  25.     [noDupes.append(i) for i in seq if not noDupes.count(i)]
  26.     return noDupes

  27. def f5(seq, idfun=None): # Alex Martelli ******* order preserving
  28.     if idfun is None:
  29.         def idfun(x): return x
  30.     seen = {}
  31.     result = []
  32.     for item in seq:
  33.         marker = idfun(item)
  34.         # in old Python versions:
  35.         # if seen.has_key(marker)
  36.         # but in new ones:
  37.         if marker in seen: continue
  38.         seen[marker] = 1
  39.         result.append(item)
  40.     return result


  41. def f5b(seq, idfun=None): # Alex Martelli ******* order preserving
  42.     if idfun is None:
  43.         def idfun(x): return x
  44.     seen = {}
  45.     result = []
  46.     for item in seq:
  47.         marker = idfun(item)
  48.         # in old Python versions:
  49.         # if seen.has_key(marker)
  50.         # but in new ones:
  51.         if marker not in seen:
  52.             seen[marker] = 1
  53.             result.append(item)
  54.             
  55.     return result



  56. def f6(seq):
  57.     # Not order preserving
  58.     return list(Set(seq))

  59. def f7(seq):
  60.     # Not order preserving
  61.     return list(set(seq))

  62. def f8(seq): # Dave Kirby
  63.     # Order preserving
  64.     seen = set()
  65.     return [x for x in seq if x not in seen and not seen.add(x)]

  66. def f9(seq):
  67.     # Not order preserving
  68.     return {}.fromkeys(seq).keys()

  69. def f10(seq, idfun=None): # Andrew Dalke
  70.     # Order preserving
  71.     return list(_f10(seq, idfun))

  72. def _f10(seq, idfun=None):
  73.     seen = set()
  74.     if idfun is None:
  75.         for x in seq:
  76.             if x in seen:
  77.                 continue
  78.             seen.add(x)
  79.             yield x
  80.     else:
  81.         for x in seq:
  82.             x = idfun(x)
  83.             if x in seen:
  84.                 continue
  85.             seen.add(x)
  86.             yield x
  87.             
  88.             
  89. def f11(seq): # f10 but simpler
  90.     # Order preserving
  91.     return list(_f10(seq))

  92. def _f11(seq):
  93.     seen = set()
  94.     for x in seq:
  95.         if x in seen:
  96.             continue
  97.         seen.add(x)
  98.         yield x
  99.             
  100. import time

  101. def timing(f, n, a):
  102.     print f.__name__,
  103.     r = range(n)
  104.     t1 = time.clock()
  105.     for i in r:
  106.         f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a)
  107.     t2 = time.clock()
  108.     print round(t2-t1, 3)
  109.     



  110. def getRandomString(length=10, loweronly=1, numbersonly=0,
  111.                     lettersonly=0):
  112.     """ return a very random string """
  113.     _letters = 'abcdefghijklmnopqrstuvwxyz'
  114.     if numbersonly:
  115.         l = list('0123456789')
  116.     elif lettersonly:
  117.         l = list(_letters + _letters.upper())
  118.     else:
  119.         lowercase = _letters+'0123456789'*2
  120.         l = list(lowercase + lowercase.upper())
  121.     shuffle(l)
  122.     s = ''.join(l)
  123.     if len(s) < length:
  124.         s = s + getRandomString(loweronly=1)
  125.     s = s[:length]
  126.     if loweronly:
  127.         return s.lower()
  128.     else:
  129.         return s

  130. testdata = {}
  131. for i in range(35):
  132.     k = getRandomString(5, lettersonly=1)
  133.     v = getRandomString(100 )
  134.     testdata[k] = v
  135.     
  136. testdata = [int(x) for x in list('21354612')]
  137. testdata += list('abcceeaa5efm')
  138. class X:
  139.     def __init__(self, n):
  140.         self.foo = n
  141.     def __repr__(self):
  142.         return ""%self.foo
  143.     def __cmp__(self, e):
  144.         return cmp(self.foo, e.foo)
  145.         
  146. testdata = []
  147. for i in range(10000):
  148.     testdata.append(getRandomString(3, loweronly=True))
  149. #testdata = ['f','g','c','d','b','a','a']


  150. order_preserving = f2, f4, f5, f5b, f8, f10, f11
  151. order_preserving = f5, f5b, f8, f10, f11

  152. not_order_preserving = f1, f3, f6, f7, f9
  153. testfuncs = order_preserving + not_order_preserving


  154. for f in testfuncs:
  155.     if f in order_preserving:
  156.         print "*",
  157.     timing(f, 100, testdata)
阅读(2253) | 评论(2) | 转发(0) |
给主人留下些什么吧!~~

eilinx2012-10-21 21:48:05

今天又在看python文档,发现一个函数式set 用来去除序列中的重复值,话说这个内置命令取出list中的重复值应该更高效吧,毕竟是内置的函数,以后有机会可以试试

testdata = ['f','g','c','d','b','a','a']
dd = list(set(testdata))
print dd
['a', 'c', 'b', 'd', 'g', 'f']