clean_dubs/compare_dirs.sh

#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
#
# This script:
#   1. Scans immediate subdirectories in <dir1> and <dir2>.
#   2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
#      the directory is removed outright.
#   3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
#      and then grouped by fuzzy similarity using a configurable threshold.
#      The fuzzy similarity process is optimized with a multiprocessing helper.
#   4. Within each group, if one directory’s name contains "2160p" and another contains "1080p",
#      the 1080p directory(ies) are removed (or flagged in dry-run mode).
#   5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
#   6. A --dry-run mode is available to preview removals without actually deleting any directories.

set -euo pipefail

# Default options (overridable with --dry-run / --threshold).
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8

# Accepted threshold syntax: a plain non-negative decimal such as 1, 0.8 or .75.
threshold_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'

# Process leading command-line flags; whatever follows them is positional.
# "${1-}" (rather than "$1") matters under `set -u`: once the arguments run
# out, a bare "$1" would abort with "unbound variable" instead of letting the
# script continue to its own argument checks.
while [[ "${1-}" == --* ]]; do
    case "$1" in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --threshold)
            # A trailing `--threshold` has no value to consume; fail with a
            # clear message instead of tripping over an unbound "$2".
            if [[ $# -lt 2 ]]; then
                echo "Error: --threshold requires a value." >&2
                exit 1
            fi
            # Reject anything that is not a plain number, since the value is
            # used in a numeric comparison later in the script.
            if [[ ! "$2" =~ $threshold_re ]]; then
                echo "Error: invalid threshold '$2' (expected a number such as 0.8)." >&2
                exit 1
            fi
            SIMILARITY_THRESHOLD="$2"
            shift 2
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done

# Exactly three positional arguments must remain at this point.
# Usage and error messages go to stderr so they never mix with the report
# the script prints on stdout.
if [[ $# -ne 3 ]]; then
    echo "Usage: $0 [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>" >&2
    exit 1
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"

# Verify input paths before anything is scanned or removed.
if [[ ! -d "$DIR1" ]]; then
    echo "Error: Directory '$DIR1' not found." >&2
    exit 1
fi
if [[ ! -d "$DIR2" ]]; then
    echo "Error: Directory '$DIR2' not found." >&2
    exit 1
fi
if [[ ! -f "$WORDS_FILE" ]]; then
    echo "Error: Words file '$WORDS_FILE' not found." >&2
    exit 1
fi

# Read undesirable words (one per line) into an array, ignoring blank lines.
mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")

# Succeeds (returns 0) when the directory name in $1 contains any entry of the
# global `words` array as a case-insensitive *literal* substring.
# Words are deliberately not treated as regular expressions: a word such as
# "." or "c++" must match only itself, because a match leads to `rm -rf`.
contains_undesirable_word() {
    local name_lower=${1,,}
    local word
    for word in "${words[@]}"; do
        word=${word%$'\r'}              # tolerate word files with CRLF line endings
        [[ -n "$word" ]] || continue    # an empty pattern would match every name
        if [[ "$name_lower" == *"${word,,}"* ]]; then
            return 0
        fi
    done
    return 1
}

echo "=== Pre-filtering Directories by Undesirable Words ==="
# Directories that do NOT match any undesirable word.
filtered_dirs=()

# Loop over immediate subdirectories in both DIR1 and DIR2.
for d in "$DIR1"/* "$DIR2"/*; do
    # Skips regular files, and the unexpanded pattern left by an empty directory.
    [[ -d "$d" ]] || continue
    # ${d##*/} is the last path component; glob results carry no trailing slash.
    if contains_undesirable_word "${d##*/}"; then
        echo "Removing '$d' because it contains an undesirable word."
        if $DRY_RUN; then
            echo "Dry-run: would remove '$d'"
        else
            # `--` keeps a path that starts with "-" from being read as an option.
            rm -rf -- "$d"
            echo "Removed '$d'"
        fi
    else
        filtered_dirs+=("$d")
    fi
done

# Function: Normalize and clean a directory name.
# Arguments: $1 - raw directory name.
# Outputs:   the name in lowercase with every character other than a-z, 0-9
#            and space removed, and with leading/trailing/repeated blanks
#            collapsed (written to stdout).
clean_name() {
    local name="$1"
    # printf instead of echo: echo would swallow a name such as "-n" or "-e"
    # as one of its own options and emit nothing.
    # The final xargs (default command: echo) re-joins the remaining words
    # with single spaces, which trims and squeezes whitespace.
    printf '%s\n' "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
}

# Function: Compute fuzzy similarities between a target and a list of strings using multiprocessing.
# Arguments: $1  - target string.
#            $2… - strings to compare the target against.
# Outputs:   one line on stdout holding one difflib SequenceMatcher ratio per
#            comparison string, space-separated, in argument order.
compute_similarities() {
    local target="$1"
    shift
    # Pass target and the list of representatives as command-line arguments to Python.
    # The delimiter is quoted so the shell passes the Python source through verbatim.
    python3 - "$target" "$@" <<'EOF'
import multiprocessing
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]


def similarity(rep):
    return SequenceMatcher(None, target, rep).ratio()


# This program is read from stdin, so worker processes cannot re-import it.
# Only the "fork" start method lets workers see `similarity`; with "spawn" or
# "forkserver" (the default on some platforms/versions) the pool cannot resolve
# it. Request fork explicitly and otherwise compute the ratios in-process.
results = None
if len(reps) > 1 and "fork" in multiprocessing.get_all_start_methods():
    try:
        with multiprocessing.get_context("fork").Pool() as pool:
            results = pool.map(similarity, reps)
    except (ImportError, OSError):
        # Pool creation is unavailable here (e.g. no usable semaphores).
        results = None
if results is None:
    results = [similarity(rep) for rep in reps]
print(" ".join(map(str, results)))
EOF
}

echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
# Grouping state.
declare -a group_rep=()   # group_rep[i]: cleaned name that represents group i.
declare -A groups=()      # groups[i]: newline-separated directory paths of group i.

# Walk the surviving directories. Each one joins the first existing group whose
# representative scores at or above SIMILARITY_THRESHOLD; failing that, it
# founds a new group and its cleaned name becomes that group's representative.
for candidate in "${filtered_dirs[@]}"; do
    candidate_base=$(basename "$candidate")
    candidate_key=$(clean_name "$candidate_base")

    if (( ${#group_rep[@]} > 0 )); then
        # A single helper call scores the candidate against every representative.
        score_line=$(compute_similarities "$candidate_key" "${group_rep[@]}")
        read -r -a scores <<< "$score_line"
        for idx in "${!scores[@]}"; do
            # bc prints 1 when the comparison holds and 0 when it does not.
            if (( $(bc -l <<< "${scores[$idx]} >= $SIMILARITY_THRESHOLD") )); then
                groups["$idx"]+=$'\n'"$candidate"
                # Placed; move on to the next directory.
                continue 2
            fi
        done
    fi

    # No representative was close enough: open a new group.
    next_slot=${#group_rep[@]}
    group_rep+=("$candidate_key")
    groups["$next_slot"]="$candidate"
done

# For every entry of the global `groups` map: when the group holds at least one
# 2160p directory, remove its 1080p directories (or only report them when
# DRY_RUN is true) and rewrite the entry to the paths that remain.
# Globals: groups (read/written), group_rep (read), DRY_RUN (read).
# Outputs: progress messages on stdout.
prefer_2160p_in_groups() {
    local key path base lower has_2160p
    local -a paths keep drop
    for key in "${!groups[@]}"; do
        # mapfile, not `read -a`: read stops at the first newline, so it would
        # only ever see the first path of a newline-separated group.
        mapfile -t paths <<< "${groups[$key]}"
        has_2160p=false
        keep=()
        drop=()
        for path in "${paths[@]}"; do
            [[ -n "$path" ]] || continue
            base=$(basename -- "$path")
            lower=${base,,}
            if [[ "$lower" == *2160p* ]]; then
                # A name mentioning both resolutions counts as 2160p, so a
                # directory can never be removed as a duplicate of itself.
                has_2160p=true
                keep+=("$path")
            elif [[ "$lower" == *1080p* ]]; then
                drop+=("$path")
            else
                keep+=("$path")
            fi
        done
        if $has_2160p && (( ${#drop[@]} > 0 )); then
            echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
            for path in "${drop[@]}"; do
                echo "Removing '$path' because a 2160p version is present."
                if $DRY_RUN; then
                    echo "Dry-run: would remove '$path'"
                else
                    rm -rf -- "$path"
                    echo "Removed '$path'"
                fi
            done
            # `keep` holds at least the 2160p path here; store it back in the
            # same newline-separated form the group used before.
            groups["$key"]=$(printf '%s\n' "${keep[@]}")
        fi
    done
}

echo "=== Resolution Preference Filtering ==="
# For each group, if one directory contains "2160p" and another contains "1080p",
# remove the 1080p directory(ies).
prefer_2160p_in_groups

# For each entry of the global `groups` map that still has more than one
# existing directory, list the directories and read from stdin the number of
# the one to remove (0 or anything invalid skips the group).
# Globals: groups (read), group_rep (read), DRY_RUN (read).
# Outputs: the prompts and progress messages on stdout.
resolve_duplicates_interactively() {
    local key path choice dir_to_remove n
    local -a paths existing
    for key in "${!groups[@]}"; do
        # mapfile, not `read -a`: read stops at the first newline, so it would
        # only ever see the first path and no group would look like a duplicate.
        mapfile -t paths <<< "${groups[$key]}"
        # Filter out directories that no longer exist.
        existing=()
        for path in "${paths[@]}"; do
            if [[ -n "$path" && -d "$path" ]]; then
                existing+=("$path")
            fi
        done
        (( ${#existing[@]} > 1 )) || continue

        echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
        for n in "${!existing[@]}"; do
            echo "  [$(( n + 1 ))] ${existing[$n]}"
        done
        printf '%s' "Enter the number of the directory you want to remove (or 0 to skip): "
        # At end of input `read` fails; treat that as "skip" instead of letting
        # `set -e` abort the whole run.
        choice=""
        read -r choice || true
        if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
            # 10# forces base 10 so an answer such as "08" is not parsed as octal.
            dir_to_remove="${existing[$(( 10#$choice - 1 ))]}"
            if $DRY_RUN; then
                echo "Dry-run: would remove '$dir_to_remove'"
            else
                rm -rf -- "$dir_to_remove"
                echo "Removed '$dir_to_remove'"
            fi
        else
            echo "No removal selected for this group."
        fi
    done
}

echo "=== Interactive Duplicate Resolution ==="
# For each group that still contains more than one directory, prompt the user to select one to remove.
resolve_duplicates_interactively

echo "Script completed."