Go – Finding and removing duplicate files on a Mac

My laptop’s hard drive is plagued with the same family pics stored in different folders. So, I wrote a simple app in Go to take care of this for me. The app does this by comparing MD5 hashes of files.

Usage

Assuming you saved the code in a file called main.go, here is how you can use it to find duplicate files (only `.jpg` files are scanned):

go run main.go -dir /some/dir -dir=/another/dir 

This will print duplicate files to the terminal. The app also accepts an `-action=sym` flag, which replaces each duplicate with a symlink to the first copy found.

go run main.go -dir /some/dir -dir=/another/dir -action=sym
package main

import (
	"crypto/md5"
	"encoding/hex"
	"flag"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

// arrayFlags collects every occurrence of a repeatable command-line flag
// (for example -dir a -dir b) into a slice. Its String and Set methods let a
// *arrayFlags be registered with flag.Var.
type arrayFlags []string

// String returns the collected values joined by commas. It is safe to call on
// a nil or empty receiver, in which case it returns the empty string, so the
// flag package does not print a bogus default in its help output.
func (r *arrayFlags) String() string {
	if r == nil {
		return ""
	}
	return strings.Join(*r, ",")
}

// Set appends value to the list. The flag package calls it once for each
// occurrence of the flag, in command-line order. It never returns an error.
func (r *arrayFlags) Set(value string) error {
	*r = append(*r, value)
	return nil
}

// getFilesInDirectories walks every directory in dir recursively and returns
// the paths of all regular files whose extension is .jpg (compared
// case-insensitively).
//
// Symbolic links are not followed. Each path is returned at most once, even
// when the given directories overlap or are repeated. Entries that cannot be
// read are reported on standard error and skipped, so the result may be
// partial. A nil slice is returned when nothing matches.
func getFilesInDirectories(dir []string) []string {
	var ret []string
	seen := make(map[string]struct{})

	for _, item := range dir {
		// Walking in-process avoids parsing the output of an external
		// command, so file names containing newlines and directory names
		// starting with a dash need no special handling.
		walkErr := filepath.Walk(item, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				// Report and keep going; returning nil skips only the
				// entry that failed.
				fmt.Fprintln(os.Stderr, "WALK_ERROR", path, err)
				return nil
			}
			if !info.Mode().IsRegular() {
				return nil
			}
			if !strings.EqualFold(filepath.Ext(info.Name()), ".jpg") {
				return nil
			}

			// Overlapping roots would otherwise list the same file twice,
			// which downstream code could mistake for a duplicate.
			key := filepath.Clean(path)
			if _, dup := seen[key]; dup {
				return nil
			}
			seen[key] = struct{}{}

			ret = append(ret, path)
			return nil
		})
		if walkErr != nil {
			fmt.Fprintln(os.Stderr, "WALK_ERROR", item, walkErr)
		}
	}

	return ret
}

// getFileMd5 streams the file at path through MD5 and returns the digest as a
// lowercase hexadecimal string. If the file cannot be opened or read, it
// returns the empty string together with the underlying error.
func getFileMd5(path string) (string, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return ``, openErr
	}
	defer f.Close()

	digest := md5.New()
	_, copyErr := io.Copy(digest, f)
	if copyErr != nil {
		return ``, copyErr
	}

	// An MD5 sum is exactly 16 bytes, so the full slice is encoded.
	return hex.EncodeToString(digest.Sum(nil)), nil
}

// main is the command-line entry point. It collects the candidate files under
// every -dir directory, groups them by MD5 hash, and prints each file that
// duplicates an earlier one (groups are separated by a blank line). With
// -action=sym (alias: -dupe_action=sym) every printed duplicate is replaced
// by a symlink to the first file of its group. Diagnostics are written to
// standard error so that standard output carries only the duplicate listing.
func main() {
	var dupeAction string
	var lookInDirectories arrayFlags

	const actionUsage = `Action to take with duplicate files. Value: sym`
	flag.StringVar(&dupeAction, "action", ``, actionUsage)
	// -dupe_action is accepted as an alias of -action; both write to the
	// same variable, so whichever appears last on the command line wins.
	flag.StringVar(&dupeAction, "dupe_action", ``, actionUsage)
	flag.Var(&lookInDirectories, `dir`, `Directory to look in`)

	flag.Parse()

	if dupeAction != `` && dupeAction != `sym` {
		fmt.Fprintln(os.Stderr, "UNKNOWN_ACTION", dupeAction, "(duplicates will only be listed)")
	}

	// Get all relevant files
	output := getFilesInDirectories(lookInDirectories)

	// Group paths by content hash. hashOrder records the order in which
	// hashes were first seen, because map iteration order is random and the
	// report should be reproducible.
	fileMap := make(map[string][]string)
	var hashOrder []string
	seenPaths := make(map[string]bool)
	for _, file := range output {
		// Skip blank entries and paths listed more than once: a path must
		// never be treated as a duplicate of itself.
		if file == `` || seenPaths[file] {
			continue
		}
		seenPaths[file] = true

		hash, err := getFileMd5(file)
		if err != nil {
			// Unhashable files must not be grouped together under the
			// empty hash, or they would be reported as duplicates.
			fmt.Fprintln(os.Stderr, "MD5_ERROR", file, err)
			continue
		}
		if _, known := fileMap[hash]; !known {
			hashOrder = append(hashOrder, hash)
		}
		fileMap[hash] = append(fileMap[hash], file)
	}

	// Print dupes
	for _, hash := range hashOrder {
		item := fileMap[hash]
		// Do nothing if the file has no duplicates
		if len(item) <= 1 {
			continue
		}

		firstFile := item[0]
		for _, file := range item[1:] {
			fmt.Println(file)

			// Sym link dupes if flag was set
			if dupeAction != `sym` {
				continue
			}
			if err := replaceWithSymlink(firstFile, file); err != nil {
				fmt.Fprintln(os.Stderr, "SYMLINK_ERROR", file, err)
			}
		}
		fmt.Println()
	}
}

// replaceWithSymlink replaces duplicate with a symbolic link that points at
// original. The link is first created under a temporary name next to
// duplicate and then renamed over it, so duplicate is only replaced once the
// link exists. It returns an error, leaving duplicate untouched, when either
// file cannot be inspected, when both names refer to the same file, or when
// creating or renaming the link fails.
func replaceWithSymlink(original, duplicate string) error {
	origInfo, err := os.Stat(original)
	if err != nil {
		return err
	}
	dupInfo, err := os.Lstat(duplicate)
	if err != nil {
		return err
	}
	// Two spellings of one path (or two hard links) are the same file;
	// replacing it could destroy the only copy of the data.
	if os.SameFile(origInfo, dupInfo) {
		return fmt.Errorf("%s and %s are the same file", original, duplicate)
	}

	// A relative symlink target is resolved against the link's own
	// directory, so it would dangle unless both files share a directory.
	target, err := filepath.Abs(original)
	if err != nil {
		return err
	}

	tmp := duplicate + `.symlink-tmp`
	// "--" stops option parsing so a path starting with a dash is safe.
	out, err := exec.Command(`ln`, `-s`, `--`, target, tmp).CombinedOutput()
	if err != nil {
		return fmt.Errorf("ln -s %s %s: %v: %s", target, tmp, err, strings.TrimSpace(string(out)))
	}
	if err := os.Rename(tmp, duplicate); err != nil {
		// Best-effort cleanup; the rename error is the one worth reporting.
		os.Remove(tmp)
		return err
	}
	return nil
}



Posted

in

by

Comments

Leave a Reply

Your email address will not be published. Required fields are marked *