clean_dubs/compare_dirs.sh

225 lines
7.1 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
#
# This script:
# 1. Scans immediate subdirectories in <dir1> and <dir2>.
# 2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
# the directory is removed outright.
# 3. The remaining directories are "cleaned" (converted to lowercase, punctuation removed)
# and then grouped by fuzzy similarity using a configurable threshold.
# The fuzzy similarity process is optimized with a multiprocessing helper.
# 4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
# the 1080p directory(ies) are removed (or flagged in dry-run mode).
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
set -euo pipefail

# Default options
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8

# Print the usage line to stderr and exit with status 1.
usage() {
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>" >&2
  exit 1
}

# Process command-line flags.
# "${1:-}" is used instead of "$1" because, under `set -u`, a bare "$1" aborts
# with "unbound variable" once the arguments run out, which would hide the
# usage message from a user who passed no arguments at all.
while [[ "${1:-}" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      # Without this check a trailing "--threshold" trips `set -u` on "$2".
      if [[ $# -lt 2 ]]; then
        echo "Error: --threshold requires a value." >&2
        usage
      fi
      SIMILARITY_THRESHOLD="$2"
      shift 2
      ;;
    --)
      # Conventional end-of-options marker; lets a path start with "--".
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# The threshold is later interpolated into a bc expression, so reject anything
# that is not a plain decimal number up front rather than failing per directory.
threshold_re='^-?([0-9]+\.?[0-9]*|\.[0-9]+)$'
if [[ ! "$SIMILARITY_THRESHOLD" =~ $threshold_re ]]; then
  echo "Error: threshold must be a decimal number, got '$SIMILARITY_THRESHOLD'." >&2
  exit 1
fi

if [[ $# -ne 3 ]]; then
  usage
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"
# Verify input paths. Both directories must exist and the words file must be a
# regular file. Diagnostics are written to stderr so they never mix with the
# script's normal progress output; any failure exits with status 1.
for dir in "$DIR1" "$DIR2"; do
  if [[ ! -d "$dir" ]]; then
    echo "Error: Directory '$dir' not found." >&2
    exit 1
  fi
done
if [[ ! -f "$WORDS_FILE" ]]; then
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi
# Read undesirable words (one per line) into an array, ignoring blank lines.
mapfile -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")

echo "=== Pre-filtering Directories by Undesirable Words ==="

# has_undesirable_word <name>
# Returns 0 when <name> contains any entry of the global `words` array as a
# literal, case-insensitive substring; returns 1 otherwise.
# Words are deliberately NOT treated as regular expressions: with a regex
# match a words-file line such as "." would match every directory name and
# cause everything to be deleted, and lines like "[" or "-v" would be
# misread as a broken pattern or as an option.
has_undesirable_word() {
  local name_lc="${1,,}"
  local word
  # The ${arr[@]+...} form keeps `set -u` quiet on an empty array in older bash.
  for word in ${words[@]+"${words[@]}"}; do
    if [[ -n "$word" && "$name_lc" == *"${word,,}"* ]]; then
      return 0
    fi
  done
  return 1
}

# Directories that do NOT match any undesirable word; consumed by later stages.
filtered_dirs=()

# Loop over immediate subdirectories in both DIR1 and DIR2.
for d in "$DIR1"/* "$DIR2"/*; do
  # Skips plain files and the literal pattern left behind by an unmatched glob.
  [[ -d "$d" ]] || continue
  if has_undesirable_word "${d##*/}"; then
    echo "Removing '$d' because it contains an undesirable word."
    # Only an explicit DRY_RUN=false deletes; anything else is a dry run.
    if [[ "${DRY_RUN:-true}" != false ]]; then
      echo "Dry-run: would remove '$d'"
    else
      rm -rf -- "${d:?}"
      echo "Removed '$d'"
    fi
  else
    filtered_dirs+=("$d")
  fi
done
# Function: Normalize and clean a directory name.
# Arguments: $1 - raw directory name
# Outputs:   the name lowercased, with every character other than a-z, 0-9
#            and space removed, and with leading/trailing whitespace trimmed
#            and inner runs of spaces collapsed to one (done by xargs).
clean_name() {
  local name="$1"
  # printf instead of echo: echo would treat a name such as "-n" or "-e" as
  # an option and print nothing, yielding an empty cleaned name.
  printf '%s\n' "$name" \
    | tr '[:upper:]' '[:lower:]' \
    | sed 's/[^a-z0-9 ]//g' \
    | xargs
}
# Function: Compute fuzzy similarities between a target and a list of strings.
# Arguments: $1   - target string
#            $2.. - strings to compare the target against
# Outputs:   one line on stdout holding a space-separated difflib
#            SequenceMatcher ratio per compared string, in argument order
#            (an empty line when no strings are given).
# The scores are computed in a single Python process. A multiprocessing pool
# is intentionally not used: the program is read from stdin, so pool workers
# started with the "spawn" or "forkserver" start methods cannot look up a
# function defined in it, and starting a pool per call costs far more than
# the comparisons themselves.
compute_similarities() {
  local target="$1"
  shift
  # Quoted delimiter: the Python source is passed through verbatim, with no
  # shell expansion. Target and candidates travel as command-line arguments.
  python3 - "$target" "$@" <<'PY'
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]
scores = [SequenceMatcher(None, target, rep).ratio() for rep in reps]
print(" ".join(map(str, scores)))
PY
}
echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="

# group_rep[i] : cleaned name that represents group i.
# groups[i]    : directory paths belonging to group i, one per line.
declare -a group_rep=()
declare -A groups=()

# Walk the surviving directories; each one joins the first existing group
# whose representative is similar enough, or else starts a new group.
for dir_path in "${filtered_dirs[@]}"; do
  dir_name=$(basename "$dir_path")
  cleaned=$(clean_name "$dir_name")
  matched_group=-1   # -1 means "no group matched yet"

  if (( ${#group_rep[@]} > 0 )); then
    # One score per representative, in the same order as group_rep.
    scores_line=$(compute_similarities "$cleaned" "${group_rep[@]}")
    read -r -a scores <<< "$scores_line"
    for idx in "${!scores[@]}"; do
      # bc prints 1 when the score reaches the threshold and 0 otherwise.
      if (( $(bc -l <<< "${scores[$idx]} >= $SIMILARITY_THRESHOLD") )); then
        matched_group=$idx
        break
      fi
    done
  fi

  if (( matched_group >= 0 )); then
    groups["$matched_group"]+=$'\n'"$dir_path"
  else
    # Appending the representative first makes the new group's index the
    # last index of group_rep.
    group_rep+=("$cleaned")
    groups["$(( ${#group_rep[@]} - 1 ))"]="$dir_path"
  fi
done
echo "=== Resolution Preference Filtering ==="

# filter_group_by_resolution <group_list> <representative>
#   group_list      newline-separated directory paths of one group
#   representative  label printed in the progress message
# When the group holds at least one directory whose name contains "2160p"
# (case-insensitive), every directory whose name contains "1080p" but not
# "2160p" is removed, or only reported when DRY_RUN is not "false".
# A name carrying both tags counts as a 2160p directory and is always kept,
# so a lone directory can never be deleted on its own account.
# Result: the global RESOLVED_GROUP is set to the newline-separated paths
# that remain in the group (flagged paths are dropped in dry-run mode too).
filter_group_by_resolution() {
  local group_list="$1" rep="$2"
  local -a members=()
  local path name_lc
  local has_2160p=false has_1080p=false

  # mapfile reads every line. `read -a` must not be used here: it stops at
  # the first newline and would only ever yield the group's first path.
  mapfile -t members <<< "$group_list"

  for path in "${members[@]}"; do
    name_lc=${path##*/}
    name_lc=${name_lc,,}
    if [[ "$name_lc" == *2160p* ]]; then
      has_2160p=true
    elif [[ "$name_lc" == *1080p* ]]; then
      has_1080p=true
    fi
  done

  local prune=false
  if [[ "$has_2160p" == true && "$has_1080p" == true ]]; then
    prune=true
    echo "Group (representative: $rep) has both 1080p and 2160p directories."
  fi

  RESOLVED_GROUP=""
  for path in "${members[@]}"; do
    # Skip the empty entries left by a leading or trailing newline.
    [[ -n "$path" ]] || continue
    name_lc=${path##*/}
    name_lc=${name_lc,,}
    if [[ "$prune" == true && "$name_lc" == *1080p* && "$name_lc" != *2160p* ]]; then
      echo "Removing '$path' because a 2160p version is present."
      # Only an explicit DRY_RUN=false deletes; anything else is a dry run.
      if [[ "${DRY_RUN:-true}" != false ]]; then
        echo "Dry-run: would remove '$path'"
      else
        rm -rf -- "${path:?}"
        echo "Removed '$path'"
      fi
    else
      RESOLVED_GROUP+="${RESOLVED_GROUP:+$'\n'}$path"
    fi
  done
  return 0
}

# For each group, if one directory contains "2160p" and another contains "1080p",
# remove the 1080p directory(ies) and store the survivors back into the group.
for key in "${!groups[@]}"; do
  filter_group_by_resolution "${groups[$key]}" "${group_rep[$key]:-unknown}"
  groups["$key"]=$RESOLVED_GROUP
done
echo "=== Interactive Duplicate Resolution ==="

# resolve_duplicate_group <group_list> <representative>
#   group_list      newline-separated directory paths of one group
#   representative  label printed in the prompt header
# Lists the group's directories that still exist and, when there is more than
# one, reads a 1-based number from stdin naming the directory to remove
# (0, invalid input, or end of input skips the group). The chosen directory
# is removed, or only reported when DRY_RUN is not "false".
resolve_duplicate_group() {
  local group_list="$1" rep="$2"
  local -a members=() existing=()
  local path choice dir_to_remove idx

  # mapfile reads every line. `read -a` must not be used here: it stops at
  # the first newline and would only ever yield the group's first path.
  mapfile -t members <<< "$group_list"

  # Filter out empty entries and directories that no longer exist.
  for path in "${members[@]}"; do
    if [[ -n "$path" && -d "$path" ]]; then
      existing+=("$path")
    fi
  done

  # A group with fewer than two live directories has nothing to resolve.
  if (( ${#existing[@]} < 2 )); then
    return 0
  fi

  echo "Duplicate group (representative: $rep):"
  for idx in "${!existing[@]}"; do
    echo " [$(( idx + 1 ))] ${existing[$idx]}"
  done
  echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
  # End of input makes read fail; treat it as "skip" rather than letting
  # `set -e` abort the whole script.
  read -r choice || choice=0

  # 10# forces base 10 so an answer like "08" is not parsed as invalid octal.
  if [[ "$choice" =~ ^[0-9]+$ ]] \
    && (( 10#$choice > 0 && 10#$choice <= ${#existing[@]} )); then
    dir_to_remove="${existing[$(( 10#$choice - 1 ))]}"
    # Only an explicit DRY_RUN=false deletes; anything else is a dry run.
    if [[ "${DRY_RUN:-true}" != false ]]; then
      echo "Dry-run: would remove '$dir_to_remove'"
    else
      rm -rf -- "${dir_to_remove:?}"
      echo "Removed '$dir_to_remove'"
    fi
  else
    echo "No removal selected for this group."
  fi
  return 0
}

# For each group that still contains more than one directory, prompt the user to select one to remove.
for key in "${!groups[@]}"; do
  resolve_duplicate_group "${groups[$key]}" "${group_rep[$key]:-unknown}"
done

echo "Script completed."