Chinaunix首页 | 论坛 | 博客
  • 博客访问: 3666021
  • 博文数量: 365
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 2522
  • 用 户 组: 普通用户
  • 注册时间: 2019-10-28 13:40
文章分类

全部博文(365)

文章存档

2023年(8)

2022年(130)

2021年(155)

2020年(50)

2019年(22)

我的朋友

分类: Python/Ruby

2021-08-09 17:26:03

import csv
import hashlib
import os
import re
from tkinter import *

import numpy as np
import requests
import xlwt
# Must stay after the tkinter star import, which also exports a name "Image".
from PIL import Image

# Browser-like request headers sent with every HTTP request made by this
# script. The value is the Internet Explorer 11 User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}

class image():
    """Download pictures from Baidu image search and tidy the result folders.

    The methods form a small pipeline: ``baidu_img`` downloads pictures for a
    keyword, ``md5`` deletes duplicate or unreadable pictures, ``rename``
    gives the remaining pictures sequential names and ``get_dirs_num`` writes
    a per-folder picture count to an ``.xls`` workbook.
    """

    # File-name suffixes (lower case, including the dot) treated as pictures.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    @staticmethod
    def _remove_file(file_path):
        """Delete *file_path*; return True on success, report and return False on failure."""
        try:
            os.remove(file_path)
        except OSError as err:
            print(err)
            return False
        return True

    def baidu_img(self, keword, num, save_root=r"E:\图片", timeout=10):
        """Download up to *num* pictures found by a Baidu image search.

        Pictures are stored as ``<save_root>/<keword>/<name>.jpg`` and every
        saved picture is appended as a ``[name, url]`` row to
        ``<save_root>/<keword>.csv``.

        Args:
            keword: Search keyword; also the name of the download sub-folder.
            num: Maximum number of pictures to save. Nothing is downloaded
                when it is zero or negative.
            save_root: Folder below which the keyword folder is created.
            timeout: Timeout in seconds for each HTTP request.
        """
        target_dir = os.path.join(save_root, keword)
        # Also creates a missing parent folder and tolerates an existing one.
        os.makedirs(target_dir, exist_ok=True)
        csv_path = target_dir + '.csv'

        # Passing the keyword via ``params`` lets requests escape characters
        # such as '&' or '#' that would otherwise corrupt the query string.
        response = requests.get(
            'https://image.baidu.com/search/index',
            params={'tn': 'baiduimage', 'ie': 'utf-8', 'word': keword},
            headers=headers,
            timeout=timeout,
        )
        # The result page embeds the picture addresses as "objURL" entries.
        pic_urls = re.findall('"objURL":"(.*?)",', response.text, re.S)
        if not pic_urls:
            print('no picture URLs found for keyword:', keword)

        saved = 0
        for url in pic_urls:
            if saved >= num:
                break
            # The name is whatever follows the last '=' of the URL; replace
            # characters that are not allowed in Windows file names.
            img_name = re.sub(r'[\\/:*?"<>|]', '_', url.split('=')[-1])
            try:
                reply = requests.get(url, headers=headers, timeout=timeout)
                # Do not store an HTTP error page as if it were a picture.
                reply.raise_for_status()
                with open(os.path.join(target_dir, img_name + '.jpg'), 'wb') as f:
                    f.write(reply.content)
                with open(csv_path, 'a', newline='') as ff:
                    csv.writer(ff, dialect='excel').writerow([img_name, url])
            except (requests.RequestException, OSError, ValueError) as e:
                # Best effort: one broken picture must not stop the run.
                print(e)
                continue
            saved += 1

    def md5(self, dirName):
        """Delete duplicate and unreadable pictures directly inside *dirName*.

        Two pictures count as duplicates when their mode, array shape, dtype
        and decoded pixel data produce the same MD5 digest. Files are visited
        in sorted name order, so the first name of a duplicate group is kept.
        Pictures that cannot be opened or decoded are deleted as well but are
        not included in the printed duplicate count.

        Args:
            dirName: Folder to clean; sub-folders are not visited.
        """
        seen = set()  # digests of the pictures kept so far
        count = 0  # number of duplicates deleted
        for file in sorted(os.listdir(dirName)):
            if not file.lower().endswith(self.IMAGE_EXTENSIONS):
                continue
            file_path = os.path.join(dirName, file)
            try:
                # The context manager closes the file handle before any
                # delete, which Windows requires for os.remove to succeed.
                with Image.open(file_path) as img:
                    mode = img.mode
                    pixels = np.array(img)
            except Exception as e:
                # Pillow raises several unrelated exception types for broken
                # files, hence the broad catch around just the decode step.
                print('unreadable image removed:', file_path, e)
                self._remove_file(file_path)
                continue

            digest = hashlib.md5()
            # Hash the layout too, so pictures with equal raw bytes but
            # different dimensions or modes are not treated as duplicates.
            digest.update(repr((mode, pixels.shape, str(pixels.dtype))).encode('utf-8'))
            digest.update(pixels.tobytes())
            key = digest.hexdigest()

            if key in seen:
                if self._remove_file(file_path):
                    count += 1
            else:
                seen.add(key)
        print("duplicate removal:", count)

    def rename(self, dirName):
        """Rename pictures to ``hivision_buiing_<folder>_<NNNN><ext>``.

        Every folder below (and including) *dirName* is processed on its own:
        its pictures are taken in sorted name order and numbered from 0001,
        keeping the original extension. Files that are not pictures are left
        alone. An existing file is never overwritten; such a picture is
        skipped with a message and keeps its current name.

        Args:
            dirName: Root folder whose pictures are renamed.
        """
        for root, _dirs, files in os.walk(dirName):
            # Folder name independent of the path separator and of a trailing
            # separator on the root path.
            folder = os.path.basename(os.path.normpath(root))
            prefix = 'hivision_buiing_' + folder
            index = 0
            for file_name in sorted(files):
                if not file_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                index += 1
                oldname = os.path.join(root, file_name)
                pic_format = os.path.splitext(file_name)[-1]
                newname = os.path.join(
                    root, '{}_{}{}'.format(prefix, str(index).zfill(4), pic_format))
                if newname == oldname:
                    continue  # already carries its final name
                if os.path.exists(newname):
                    # os.rename silently replaces the target on POSIX.
                    print('skipped, target exists:', newname)
                    continue
                try:
                    os.rename(oldname, newname)
                    print(newname)
                except OSError as f:
                    print(f)

    def get_dirs_num(self, path=r'E:\图片'):
        """Write the number of pictures per folder to ``<path>/count.xls``.

        The sheet has a header row followed by one row for every folder below
        (and including) *path* that directly contains at least one picture:
        folder path, folder name, picture count.

        Args:
            path: Root folder to scan; the workbook is saved inside it.
        """
        counts = {}
        for root, _dirs, files in os.walk(path):
            counts[root] = sum(
                1 for entry in files
                if entry.lower().endswith(self.IMAGE_EXTENSIONS)
                and os.path.isfile(os.path.join(root, entry))
            )

        f = xlwt.Workbook()
        sheet1 = f.add_sheet(u'统计文件数量', cell_overwrite_ok=True)
        for col, title in enumerate(['文件夹路径', '文件夹名称', '文件数量']):
            sheet1.write(0, col, title)

        row = 0
        for folder, values in counts.items():
            if values <= 0:
                continue
            row += 1
            sheet1.write(row, 0, folder)
            # A plain string: xlwt interprets a list as rich-text segments
            # and would glue the path components together.
            sheet1.write(row, 1, os.path.basename(os.path.normpath(folder)))
            sheet1.write(row, 2, values)

        f.save(os.path.join(path, 'count.xls'))

if __name__ == '__main__':
    # Search term to crawl; fill it in before running the script.
    keyword = ''
    # Upper bound on the number of pictures to download.
    num = 20
    # Folder that receives the pictures for this keyword.
    dirName = r"E:\图片\\{}".format(keyword)

    # Pipeline: download -> drop duplicates -> rename -> write folder counts.
    crawler = image()
    crawler.baidu_img(keyword, num)
    crawler.md5(dirName)
    crawler.rename(dirName)
    crawler.get_dirs_num()

阅读(1498) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~