My laptop’s hard drive is plagued with the same family pics stored in different folders. So, I wrote a simple app in Go to take care of this for me. The app does this by comparing MD5 hashes of files.
Usage
Assuming you saved the code in a file called main.go, here is how you can use it to find duplicate files:
go run main.go -dir /some/dir -dir=/another/dir
This will print duplicate files to the terminal. I also added an `-action=sym` flag, which replaces every duplicate with a symlink to a single copy of the file.
go run main.go -dir /some/dir -dir=/another/dir -action=sym
package main
import (
	"crypto/md5"
	"encoding/hex"
	"flag"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)
// arrayFlags is a flag.Value that collects every occurrence of a repeatable
// command-line flag, so the same flag can be given several times to build up
// a list of directories.
type arrayFlags []string

// String returns the values collected so far, joined by commas. It returns
// the empty string for a nil or empty receiver, because the flag package may
// call String on a zero-valued receiver when printing defaults.
func (r *arrayFlags) String() string {
	if r == nil {
		return ""
	}
	return strings.Join(*r, ",")
}

// Set appends value to the list. It is called by the flag package once per
// occurrence of the flag and never returns an error.
func (r *arrayFlags) Set(value string) error {
	*r = append(*r, value)
	return nil
}
// getFilesInDirectories returns the paths of all regular files whose name
// matches "*.jpg" (case-insensitively) beneath each directory in dir,
// searching recursively.
//
// The search is delegated to the external find command, which must be on
// PATH. Paths are returned exactly as find prints them, one per output line.
// A failing find invocation is reported on standard error and does not abort
// the scan of the remaining directories. The result is nil when nothing is
// found.
func getFilesInDirectories(dir []string) []string {
	var ret []string
	for _, item := range dir {
		// find would parse a root that starts with "-" as an option instead
		// of a path, so anchor such a root to the current directory.
		root := item
		if strings.HasPrefix(root, "-") {
			root = "./" + root
		}

		// This could have been done with pure Go, but I was lazy
		c := exec.Command(`find`, root, `-type`, `f`, `-iname`, `*.jpg`)
		outBytes, err := c.Output()
		if err != nil {
			// find also exits non-zero after a partial failure (for example
			// one unreadable sub-directory), so report the problem but still
			// use whatever it managed to print.
			fmt.Fprintln(os.Stderr, "FIND_ERROR", item, err)
		}

		for _, fileItem := range strings.Split(string(outBytes), "\n") {
			// The output ends with a newline, so splitting always yields a
			// trailing empty string; it is not a file.
			if fileItem == "" {
				continue
			}
			ret = append(ret, fileItem)
		}
	}
	return ret
}
// getFileMd5 streams the file at path through an MD5 hasher and returns the
// digest encoded as a hexadecimal string. If the file cannot be opened or
// read, it returns the empty string together with the underlying error.
func getFileMd5(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// Copy straight into the hasher so the whole file is never held in memory.
	h := md5.New()
	_, err = io.Copy(h, f)
	if err != nil {
		return "", err
	}

	// Sum(nil) yields the complete 16-byte MD5 digest.
	digest := h.Sum(nil)
	return hex.EncodeToString(digest), nil
}
// main scans the directories given with -dir, groups the files found there
// by MD5 digest and prints every group that contains more than one file.
// Each group lists all files except the first, followed by a blank line.
//
// Flags:
//
//	-dir          directory to scan; may be repeated
//	-action       what to do with duplicates; "sym" replaces every duplicate
//	              with a symlink to the first file of its group
//	-dupe_action  alias for -action
//
// Duplicates are printed on standard output; problems are reported on
// standard error. Files are considered identical when their MD5 digests
// match; contents are not compared byte by byte.
func main() {
	var dupeAction string
	var lookInDirectories arrayFlags
	flag.StringVar(&dupeAction, "action", ``, `Action to take with duplicate files. Value: sym`)
	// Same destination variable, so either spelling of the flag works.
	flag.StringVar(&dupeAction, "dupe_action", ``, `Alias for -action`)
	flag.Var(&lookInDirectories, `dir`, `Directory to look in`)
	flag.Parse()

	// Refuse unknown actions instead of silently doing nothing with them.
	if dupeAction != `` && dupeAction != `sym` {
		fmt.Fprintf(os.Stderr, "unknown action %q (supported: sym)\n", dupeAction)
		os.Exit(2)
	}
	if len(lookInDirectories) == 0 {
		flag.Usage()
		os.Exit(2)
	}

	// Group all relevant files by digest.
	fileMap := make(map[string][]string)
	seen := make(map[string]bool)
	for _, file := range getFilesInDirectories(lookInDirectories) {
		if file == `` {
			continue
		}

		// Overlapping -dir values list the same path more than once; a file
		// must never be treated as a duplicate of itself.
		key := file
		if abs, err := filepath.Abs(file); err == nil {
			key = abs
		}
		if seen[key] {
			continue
		}
		seen[key] = true

		hash, err := getFileMd5(file)
		if err != nil {
			// Skip unreadable files: storing them under the empty digest
			// would make them look like duplicates of each other.
			fmt.Fprintln(os.Stderr, "MD5_ERROR", file, err)
			continue
		}
		fileMap[hash] = append(fileMap[hash], file)
	}

	// Print dupes
	for _, item := range fileMap {
		// Do nothing if the file has no duplicates
		if len(item) <= 1 {
			continue
		}
		firstFile := item[0]
		for _, file := range item[1:] {
			fmt.Println(file)
			// Sym link dupes if flag was set
			if dupeAction != `sym` {
				continue
			}
			if err := replaceWithSymlink(firstFile, file); err != nil {
				fmt.Fprintln(os.Stderr, `Failed to symlink `, file, err)
			}
		}
		fmt.Println()
	}
}

// replaceWithSymlink replaces the file at path with a symbolic link that
// points at target. The link is created under a temporary name and then
// renamed over path, so path is left untouched if any step fails. It does
// nothing when path and target already refer to the same file.
func replaceWithSymlink(target, path string) error {
	// A relative link target is resolved against the directory that holds
	// the link, not the working directory, so always link to an absolute path.
	absTarget, err := filepath.Abs(target)
	if err != nil {
		return fmt.Errorf("resolving %q: %w", target, err)
	}

	targetInfo, err := os.Stat(absTarget)
	if err != nil {
		return err
	}
	pathInfo, err := os.Stat(path)
	if err != nil {
		return err
	}
	if os.SameFile(targetInfo, pathInfo) {
		// Same underlying file (hard link or alternate spelling of the
		// path): replacing it could destroy the only copy.
		return nil
	}

	tmp := path + ".dupe-tmp"
	if err := os.Symlink(absTarget, tmp); err != nil {
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		// Best-effort cleanup; the rename error is the one worth reporting.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
Leave a Reply