Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1493995
  • 博文数量: 408
  • 博客积分: 10036
  • 博客等级: 上将
  • 技术积分: 4440
  • 用 户 组: 普通用户
  • 注册时间: 2006-04-06 13:57
文章分类

全部博文(408)

文章存档

2011年(1)

2010年(2)

2009年(1)

2008年(3)

2007年(7)

2006年(394)

我的朋友

分类: Python/Ruby

2006-06-20 16:55:41

VCKbase的论坛上有人提了一个问题:
遇到的问题是,我把E-mail地址收集在txt文件中(每行一个E-mail地址,共78000行) ,要求是把其中重复的E-mail地址全部去掉。考虑编程方便,我用了CString类,但似乎 效率不高。我写出来的整理78000份E-mail地址的需要近7分钟的时间

我给他用Python实现了一个(这个版本是修改过的,当时我随手实现的一个版本做错了,呵呵)
这里算法的主要内容就是利用hash_table来判断email是否已经重复出现,这比字符串的查找要快的多。

#remove duplicated email address from file
import datetime

if __name__ == "__main__":
t = datetime.datetime(2000,1,1)
print str(t.today())
hashtable = {}
f = file("email.txt","r")
f2 = file("email_new.txt","w")
line = f.readline();
while len(line)>0:
if not hashtable.has_key(line):
hashtable[line] = 1
f2.write(line)
line = f.readline();
f.close()
f2.close()
t2 = datetime.datetime(2000,1,1)
print str(t2.today())

上面的算法在我的机器上:P4 2.8G,512M 内存,耗时大约300ms,跟他的7分钟差了天了。 而我当时错误的算法,需要2分半钟,所以我贴到我们内部的BBS上,我的一个同学实现了如下算法(C):
#include 
#include
#include
#include "list.h"

#define HASH_SIZE 262157
#define LINE_MAX_LEN 1024

struct list_node {
struct list_head head;
int len;
char *str;
};

struct list_head hash_table[HASH_SIZE];

static inline int
hash_string (const char *ptr, int len)
{
unsigned int hash = 0;

for (hash = 0; len; len--, ptr++) {
/* (31 * hash) will probably be optimized to ((hash << 5) - hash). */
hash = 31 * hash + *ptr;
}

return (hash % HASH_SIZE);
}

static inline int
cmpfn(const struct list_node *node, int len, char *str)
{
return ((node->len == len) && (memcmp(node->str, str, len) == 0));
}

int main(int argc, char **argv)
{
int i, hash, len;
struct list_node *node;
char buf[LINE_MAX_LEN];
FILE *fp_in, *fp_out;

if(argc != 3) {
printf("error: \nplease use eml as:\n eml infile outfile\n");
return -1;
}
fp_in = fopen(argv[1], "r");
if(!fp_in) {
printf("error: could not open infile %s\n", argv[1]);
return -1;
}
fp_out = fopen(argv[2], "w");
if(!fp_out) {
printf("error: could not open infile %s\n", argv[2]);
return -1;
}
for(i=0; istr = (char *)malloc(len + 1);
node->len = len;
memcpy(node->str, buf, len);
node->str[len] = '\0';
list_append(&(hash_table[hash]), node);
}
}

return 0;
}
还有.h文件:

#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

extern inline void prefetch(const void *x)
{
return;
}
/*
* Simple doubly linked list implementation.
*
* Some of the internal functions ("__xxx") are useful when
* manipulating whole lists rather than single entries, as
* sometimes we already know the next/prev entries and we can
* generate better code by using them directly rather than
* using the generic single-entry routines.
*/

struct list_head {
struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)

#define INIT_LIST_HEAD(ptr) do { \
(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void
__list_add(struct list_head *new,
struct list_head *prev, struct list_head *next)
{
next->prev = new;
new->next = next;
new->prev = prev;
prev->next = new;
}

/**
* list_add - add a new entry
* @new: new entry to be added
* @head: list head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*/
static inline void
list_add(struct list_head *new, struct list_head *head)
{
__list_add(new, head, head->next);
}

/**
* list_add_tail - add a new entry
* @new: new entry to be added
* @head: list head to add it before
*
* Insert a new entry before the specified head.
* This is useful for implementing queues.
*/
static inline void
list_add_tail(struct list_head *new, struct list_head *head)
{
__list_add(new, head->prev, head);
}

/*
* Delete a list entry by making the prev/next entries
* point to each other.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void
__list_del(struct list_head *prev, struct list_head *next)
{
next->prev = prev;
prev->next = next;
}

/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
* Note: list_empty on entry does not return true after this, the entry is in an undefined state.
*/
static inline void
list_del(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
entry->next = (void *) 0;
entry->prev = (void *) 0;
}

/**
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
static inline void
list_del_init(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
INIT_LIST_HEAD(entry);
}

/**
* list_move - delete from one list and add as another's head
* @list: the entry to move
* @head: the head that will precede our entry
*/
static inline void
list_move(struct list_head *list, struct list_head *head)
{
__list_del(list->prev, list->next);
list_add(list, head);
}

/**
* list_move_tail - delete from one list and add as another's tail
* @list: the entry to move
* @head: the head that will follow our entry
*/
static inline void
list_move_tail(struct list_head *list, struct list_head *head)
{
__list_del(list->prev, list->next);
list_add_tail(list, head);
}

/**
* list_empty - tests whether a list is empty
* @head: the list to test.
*/
static inline int
list_empty(struct list_head *head)
{
return head->next == head;
}

static inline void
__list_splice(struct list_head *list, struct list_head *head)
{
struct list_head *first = list->next;
struct list_head *last = list->prev;
struct list_head *at = head->next;

first->prev = head;
head->next = first;

last->next = at;
at->prev = last;
}

/**
* list_splice - join two lists
* @list: the new list to add.
* @head: the place to add it in the first list.
*/
static inline void
list_splice(struct list_head *list, struct list_head *head)
{
if (!list_empty(list))
__list_splice(list, head);
}

/**
* list_splice_init - join two lists and reinitialise the emptied list.
* @list: the new list to add.
* @head: the place to add it in the first list.
*
* The list at @list is reinitialised
*/
static inline void
list_splice_init(struct list_head *list, struct list_head *head)
{
if (!list_empty(list)) {
__list_splice(list, head);
INIT_LIST_HEAD(list);
}
}

/**
* list_entry - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*/
#define list_entry(ptr, type, member) \
((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))

/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop counter.
* @head: the head for your list.
*/
#define list_for_each(pos, head) \
for (pos = (head)->next, prefetch(pos->next); pos != (head); \
pos = pos->next, prefetch(pos->next))

/**
* list_for_each_safe - iterate over a list safe against removal of list entry
* @pos: the &struct list_head to use as a loop counter.
* @n: another &struct list_head to use as temporary storage
* @head: the head for your list.
*/
#define list_for_each_safe(pos, n, head) \
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)

/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop counter.
* @head: the head for your list.
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry(pos, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member), \
prefetch(pos->member.next); \
&pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member), \
prefetch(pos->member.next))

/*this is the begin of listhelp ;-)*/

#define LIST_FIND(head, cmpfn, type, args...) \
({ \
const struct list_head *__i = (head); \
\
do { \
__i = __i->next; \
if (__i == (head)) { \
__i = NULL; \
break; \
} \
} while (!cmpfn((const type)__i , ## args)); \
(type)__i; \
})

#define LIST_FIND_W(head, cmpfn, type, args...) \
({ \
const struct list_head *__i = (head); \
\
do { \
__i = __i->next; \
if (__i == (head)) { \
__i = NULL; \
break; \
} \
} while (!cmpfn((type)__i , ## args)); \
(type)__i; \
})

static inline int
__list_cmp_same(const void *p1, const void *p2)
{
return p1 == p2;
}

/* Is this entry in the list? */
static inline int
list_inlist(struct list_head *head, const void *entry)
{
return LIST_FIND(head, __list_cmp_same, void *, entry) !=NULL;
}

/* Delete from list. */
#ifdef CONFIG_NETFILTER_DEBUG
#define LIST_DELETE(head, oldentry) \
do { \
if (!list_inlist(head, oldentry)) \
printk("LIST_DELETE: %s:%u `%s'(%p) not in %s.\n", \
__FILE__, __LINE__, #oldentry, oldentry, #head); \
else list_del((struct list_head *)oldentry); \
} while(0)
#else
#define LIST_DELETE(head, oldentry) list_del((struct list_head *)oldentry)
#endif

/* Append. */
static inline void
list_append(struct list_head *head, void *new)
{
list_add((new), (head)->prev);
}

/* Prepend. */
static inline void
list_prepend(struct list_head *head, void *new)
{
list_add(new, head);
}

/* Insert according to ordering function; insert before first true. */
#define LIST_INSERT(head, new, cmpfn) \
do { \
struct list_head *__i; \
for (__i = (head)->next; \
!cmpfn((new), (typeof (new))__i) && __i != (head); \
__i = __i->next); \
list_add((struct list_head *)(new), __i->prev); \
} while(0)

/* If the field after the list_head is a nul-terminated string, you
can use these functions. */
static inline int
__list_cmp_name(const void *i, const char *name)
{
return strcmp(name, i + sizeof (struct list_head)) == 0;
}

static inline int
__list_cmp_nameptr(const void *i, const char *name)
{
return strcmp(name, ((char *)
*(unsigned long *) (i +
sizeof (struct list_head)))) ==
0;
}

/* Returns false if same name already in list, otherwise does insert. */
static inline int
list_named_insert(struct list_head *head, void *new)
{
if (LIST_FIND(head, __list_cmp_name, void *,
new + sizeof (struct list_head)))
return 0;
list_prepend(head, new);
return 1;
}

static inline int
list_nameptr_insert(struct list_head *head, void *new)
{
if (LIST_FIND(head, __list_cmp_nameptr, void *,
new + sizeof (struct list_head)))
return 0;
list_prepend(head, new);
return 1;
}

/* Find this named element in the list. */
#define list_named_find(head, name) \
LIST_FIND(head, __list_cmp_name, void *, name)

#define list_nameptr_find(head, name) \
LIST_FIND(head, __list_cmp_nameptr, void *, name)

#endif

在我的机器上测试(cygwin):耗时竟然是400ms!
当然,这里包含了Cygwin的间接调用开销。

很难想象吧,Python在这里的性能竟然跟C不相上下!而且Python的代码比C不知道短了多少倍!
所以讲,Python是蛮好用的,:-).
不过,这个例子其实是个特殊情况,以前我们玩过的另外一个程序,C的性能就比Python快得多。不过代码量还是Python简短得多。 有兴趣的朋友可以用自己熟悉的语言来测试一下,下面是测试文件(也是我用11行python代码产生的,有1/10的email是重复的):

阅读(1256) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~