Chinaunix首页 | 论坛 | 博客
  • 博客访问: 657053
  • 博文数量: 149
  • 博客积分: 3901
  • 博客等级: 中校
  • 技术积分: 1558
  • 用 户 组: 普通用户
  • 注册时间: 2009-02-16 14:33
文章分类

全部博文(149)

文章存档

2014年(2)

2013年(10)

2012年(32)

2011年(21)

2010年(84)

分类: 数据库开发技术

2011-04-07 17:58:53


1.  url  加个 '/' 前缀
   /
   /xx.html?a=b..

2.
  

    /*
     * 预处理后的 url 处理。request_url : 返回处理后的 (domain, url, domain_url)
     *   -                    : (-, -, -)
     *   http://xx/cc         : (xx, /cc, xx/cc)
     *   / 或者 ''            : (/, /, /)
     *   http://xx/?a=a       : (xx, /?a=a, xx/?a=a)
     *   http://xx?a=a        : (xx, ?a=a, xx?a=a)
     *   /it/shopping/mobile/ : (/, /it/shopping/mobile/, /it/shopping/mobile/)
     *
     * 合并 domain url
     */

    
    public static String joinUrl(String domain, String url) {

        if (domain == null)
            domain = "";
        if (url == null)
            url = "";
        
        domain = getDomain(domain);
        url = getAbsUrl(url);

        return domain + url;
    }

    
    private static Pattern p_d = Pattern.compile("^(.*?)/(.*)$");
    public static String getDomain(String url){
        if (url == null)url = "";
        url = url.trim();
        Matcher md = p_d.matcher(url);
        if(md.find()){
            url = md.group(1);
        }
        return url;
    }
    
    private static Pattern p_e = Pattern.compile("/+$");
    private static Pattern p_s = Pattern.compile("^/+");
    public static String getAbsUrl(String adsUrl) {
        if (adsUrl == null)adsUrl = "";
        
        adsUrl = "/"+getUrl(adsUrl);
        
        Matcher md = p_d.matcher(adsUrl);
        if(md.find()){
            adsUrl = md.group(2);
        }
        
        Matcher ms = p_s.matcher(adsUrl);
        if (ms.find()) {
            adsUrl = ms.replaceAll("");
        }
        return adsUrl;
    }

    private static Pattern p_w = Pattern.compile("^(.*?)\\?.*");
    public static String getUrl(String url) {
        if (url == null)url = "";
        Matcher mw = p_w.matcher(url);
        if (mw.find()) {
            url = mw.group(1);
        }
        
        Matcher ms = p_e.matcher(url);
        if (ms.find()) {
            url = ms.replaceAll("");
        }
        
        
        return url ;
    }


# request_url : 返回 处理后的 (domain,url,domain_url )
    # - : (-,-,-)
    # http://xx/cc : (xx,/cc,xx/cc)
    # / 或者 '' : ( /,/,/ )
    # http://xx/?a=a : (xx,/?a=a,xx/?a=a)
    # http://xx?a=a : (xx,?a=a,xx?a=a)
    #
    # /it/shopping/mobile/ : (/,/it/shopping/mobile/,/it/shopping/mobile/)
    def _getDomainUrl(self,request_url) :
        if request_url in ['','/']: return ('/','/','/')
        if request_url == '-' : return ('-','-','-')
        domain,url = ("","")
        http_num = request_url.find('http://')+7
        https_num = request_url.find('https://')+8
        if http_num >= 7 : request_url = request_url[http_num:]
        if https_num >= 8 : request_url = request_url[https_num:]
        ux,uw = request_url.find('/'),request_url.find('?')
        if ux==-1 and uw==-1 :
            spl = -1
        elif ux!=-1 and uw!=-1 :
            spl = min( ux,uw )
        else :
            if ux!=-1 :spl = ux
            elif uw!=-1 : spl = uw

        if spl == -1 :
            domain = request_url
            url = '/'
        else :
            domain = request_url[:spl]
            url = request_url[spl:]
        if url == '/' : domain_url = domain
        else : domain_url = domain + url
        if domain == '' : domain = '/'
        return (domain,url,domain_url)


阅读(1002) | 评论(0) | 转发(0) |
0

上一篇:hadoop 二次排序

下一篇:url 预处理 细节

给主人留下些什么吧!~~