#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
#
# This script:
# 1. Scans immediate subdirectories in <dir1> and <dir2>.
# 2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
#    the directory is removed outright.
# 3. The remaining directories are "cleaned" (converted to lowercase, punctuation removed)
#    and then grouped by fuzzy similarity using a configurable threshold.
#    The similarity scores are computed by an embedded Python helper.
# 4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
#    the 1080p directory(ies) are removed (or flagged in dry-run mode).
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.

set -euo pipefail

# Default options (both can be overridden by the flags parsed below).
DRY_RUN=false            # true => only report what would be removed
SIMILARITY_THRESHOLD=0.8 # minimum similarity ratio for two names to share a group

# Accepted spellings of the threshold: 1, 0.8, .8, 1.
threshold_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'

# Process leading command-line flags. Parsing stops at the first argument that
# does not start with "--", leaving the positional arguments in "$@".
# "${1:-}" is used because a bare "$1" aborts with "unbound variable" under
# `set -u` when the script is started without any arguments.
while [[ "${1:-}" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      # Without this check a missing value trips `set -u` on "$2".
      if [[ $# -lt 2 ]]; then
        echo "Error: --threshold requires a value." >&2
        exit 1
      fi
      if [[ ! "$2" =~ $threshold_re ]]; then
        echo "Error: threshold must be a non-negative number, got '$2'." >&2
        exit 1
      fi
      SIMILARITY_THRESHOLD="$2"
      shift 2
      ;;
    --)
      # Conventional end-of-options marker.
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Exactly three positional arguments must remain once the flags are consumed.
# Usage and error text goes to stderr so stdout carries only the progress report.
if [[ $# -ne 3 ]]; then
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>" >&2
  exit 1
fi

DIR1="$1"       # first directory whose immediate subdirectories are scanned
DIR2="$2"       # second directory whose immediate subdirectories are scanned
WORDS_FILE="$3" # text file listing one undesirable word per line

# Verify input paths before anything is scanned or deleted.
if [[ ! -d "$DIR1" ]]; then
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [[ ! -d "$DIR2" ]]; then
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [[ ! -f "$WORDS_FILE" ]]; then
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi
# An unreadable file would otherwise be indistinguishable from an empty word list.
if [[ ! -r "$WORDS_FILE" ]]; then
  echo "Error: Words file '$WORDS_FILE' is not readable." >&2
  exit 1
fi

# Read undesirable words (one per line) into an array, ignoring blank lines.
mapfile -t words < <(grep -v '^[[:space:]]*$' -- "$WORDS_FILE")

#######################################
# Report whether a directory name contains any undesirable word.
# The check is a case-insensitive *literal* substring test. Words are not
# treated as regular expressions, so a word such as "." or "[" cannot match
# every name, and a word starting with "-" is not mistaken for an option.
# A trailing carriage return (CRLF word files) is ignored.
# Arguments: $1  - directory base name
#            $2… - undesirable words
# Returns:   0 if at least one word occurs in the name, 1 otherwise
#######################################
name_has_undesirable_word() {
  local name_lc="${1,,}"
  shift
  local word
  for word in "$@"; do
    word="${word%$'\r'}"
    [[ -n "$word" ]] || continue
    if [[ "$name_lc" == *"${word,,}"* ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Scan the immediate subdirectories of each given root. Subdirectories whose
# name contains an undesirable word are removed (or only reported in dry-run
# mode); all others are appended to filtered_dirs in scan order.
# Globals:   words (read), DRY_RUN (read), filtered_dirs (appended)
# Arguments: root directories to scan
# Outputs:   progress messages on stdout
#######################################
prefilter_directories() {
  local root d
  for root in "$@"; do
    # An empty root would turn the glob below into "/*"; never scan that.
    [[ -n "$root" ]] || continue
    for d in "$root"/*; do
      # Also skips the literal pattern left behind when the glob matches nothing.
      [[ -d "$d" ]] || continue
      if name_has_undesirable_word "${d##*/}" ${words[@]+"${words[@]}"}; then
        echo "Removing '$d' because it contains an undesirable word."
        if $DRY_RUN; then
          echo "Dry-run: would remove '$d'"
        else
          rm -rf -- "$d"
          echo "Removed '$d'"
        fi
      else
        filtered_dirs+=("$d")
      fi
    done
  done
}

echo "=== Pre-filtering Directories by Undesirable Words ==="
# Directories that do NOT match any undesirable word, in scan order.
filtered_dirs=()
prefilter_directories "$DIR1" "$DIR2"

#######################################
# Normalize a directory name for fuzzy comparison: lowercase it, drop every
# character other than a-z, 0-9 and space, then trim the ends and collapse
# runs of spaces into one.
# Implemented with parameter expansion only, so no process is forked per call,
# and a name such as "-n" cannot be swallowed as an option to echo.
# Arguments: $1 - raw directory name
# Outputs:   the cleaned name followed by a newline, on stdout
#######################################
clean_name() {
  local name="${1-}"
  local IFS=$' \t\n' # fixed here so splitting/joining ignores the caller's IFS
  local -a parts=()
  name="${name,,}"
  name="${name//$'\n'/ }"      # an embedded newline acts as a word separator
  name="${name//[^a-z0-9 ]/}"  # remove punctuation and anything else
  read -r -a parts <<< "$name" # word-splitting trims and collapses spaces
  printf '%s\n' "${parts[*]-}"
}

#######################################
# Compute the difflib similarity ratio between a target string and each of a
# list of candidate strings.
# The ratios are computed sequentially in a single Python process. A
# multiprocessing pool is deliberately not used: a worker function defined in
# a script read from stdin cannot be located by worker processes under the
# spawn/forkserver start methods, and starting a pool costs far more than
# comparing a handful of short names.
# The heredoc delimiter is quoted so the shell never expands the Python code.
# Arguments: $1  - target string
#            $2… - candidate strings
# Outputs:   one line on stdout with the ratios separated by single spaces,
#            in candidate order (an empty line when there are no candidates)
#######################################
compute_similarities() {
  local target="$1"
  shift
  python3 - "$target" "$@" <<'EOF'
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]

# Argument order matters: SequenceMatcher.ratio() is not guaranteed symmetric.
print(" ".join(str(SequenceMatcher(None, target, rep).ratio()) for rep in reps))
EOF
}

echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
# Initialize grouping arrays. Both are indexed by group number; groups is an
# indexed (not associative) array so later passes visit the groups in creation
# order instead of hash order.
declare -a group_rep=() # group_rep[i]: cleaned name of the first directory put in group i
declare -a groups=()    # groups[i]: newline-separated directory paths of group i

#######################################
# Assign every directory in filtered_dirs to a similarity group.
# A directory joins the first existing group whose representative has a
# similarity ratio >= SIMILARITY_THRESHOLD with its cleaned name; otherwise it
# starts a new group and becomes that group's representative.
# The threshold test for a whole row of ratios is done by one awk call rather
# than one bc process per ratio.
# Globals:   filtered_dirs, SIMILARITY_THRESHOLD (read);
#            group_rep, groups (written)
# Calls:     clean_name, compute_similarities
#######################################
group_directories() {
  local d cleaned similarities match_index new_index
  # The ${var[@]+…} form keeps an empty list from tripping `set -u` on old bash.
  for d in ${filtered_dirs[@]+"${filtered_dirs[@]}"}; do
    cleaned=$(clean_name "${d##*/}")
    match_index=""
    if [[ ${#group_rep[@]} -gt 0 ]]; then
      similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
      # Print the zero-based position of the first ratio that reaches the
      # threshold; print nothing when none does.
      match_index=$(awk -v threshold="$SIMILARITY_THRESHOLD" '
        {
          for (i = 1; i <= NF; i++) {
            if ($i + 0 >= threshold + 0) { print i - 1; exit }
          }
        }' <<< "$similarities")
    fi
    if [[ -n "$match_index" ]]; then
      groups["$match_index"]+=$'\n'"$d"
    else
      new_index=${#group_rep[@]}
      group_rep+=("$cleaned")
      groups["$new_index"]="$d"
    fi
  done
}

group_directories

echo "=== Resolution Preference Filtering ==="

#######################################
# For each group that holds a directory whose name contains "2160p", remove
# the directories whose name contains "1080p" (and not "2160p"), then store
# the surviving paths back into the group.
# The group's newline-separated path list is split with mapfile: a plain
# `read -a` stops at the first newline and would only ever see one path.
# A name that contains both tags counts as 2160p and is kept.
# Globals:   groups (read/written), group_rep (read), DRY_RUN (read)
# Outputs:   progress messages on stdout
#######################################
filter_lower_resolution() {
  local key path name_lc has_2160p
  local -a paths kept doomed
  for key in "${!groups[@]}"; do
    mapfile -t paths <<< "${groups[$key]}"
    kept=()
    doomed=()
    has_2160p=false
    for path in "${paths[@]}"; do
      [[ -n "$path" ]] || continue
      name_lc="${path##*/}"
      name_lc="${name_lc,,}" # the tags are matched case-insensitively
      if [[ "$name_lc" == *2160p* ]]; then
        has_2160p=true
        kept+=("$path")
      elif [[ "$name_lc" == *1080p* ]]; then
        doomed+=("$path")
      else
        kept+=("$path")
      fi
    done
    if $has_2160p && [[ ${#doomed[@]} -gt 0 ]]; then
      echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
      for path in "${doomed[@]}"; do
        echo "Removing '$path' because a 2160p version is present."
        if $DRY_RUN; then
          echo "Dry-run: would remove '$path'"
        else
          rm -rf -- "$path"
          echo "Removed '$path'"
        fi
      done
      # kept is never empty here: it contains at least the 2160p directory.
      groups["$key"]=$(printf '%s\n' "${kept[@]}")
    fi
  done
}

filter_lower_resolution

echo "=== Interactive Duplicate Resolution ==="

#######################################
# For each group that still holds more than one existing directory, list the
# directories and ask on stdin which one to remove (0 or anything invalid
# skips the group). At most one directory is removed per group.
# The group's newline-separated path list is split with mapfile: a plain
# `read -a` stops at the first newline and would only ever see one path.
# Globals:   groups (read), group_rep (read), DRY_RUN (read)
# Inputs:    one answer per prompted group, read from stdin
# Outputs:   listings, prompts and progress messages on stdout
#######################################
resolve_duplicates_interactively() {
  local key path choice number dir_to_remove
  local -a paths existing
  for key in "${!groups[@]}"; do
    mapfile -t paths <<< "${groups[$key]}"
    # Filter out directories that no longer exist.
    existing=()
    for path in "${paths[@]}"; do
      if [[ -n "$path" && -d "$path" ]]; then
        existing+=("$path")
      fi
    done
    [[ ${#existing[@]} -gt 1 ]] || continue

    echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
    number=1
    for path in "${existing[@]}"; do
      echo " [$number] $path"
      number=$((number + 1))
    done
    echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
    # End of input is treated as "skip" instead of aborting under `set -e`.
    if ! read -r choice; then
      choice=""
      echo
    fi
    # 10# forces base 10 so answers such as "08" are not parsed as octal.
    if [[ "$choice" =~ ^[0-9]+$ ]] \
      && ((10#$choice > 0 && 10#$choice <= ${#existing[@]})); then
      dir_to_remove="${existing[$((10#$choice - 1))]}"
      if $DRY_RUN; then
        echo "Dry-run: would remove '$dir_to_remove'"
      else
        rm -rf -- "$dir_to_remove"
        echo "Removed '$dir_to_remove'"
      fi
    else
      echo "No removal selected for this group."
    fi
  done
}

resolve_duplicates_interactively

echo "Script completed."