clean_dubs/compare_dirs_improved.sh
2025-02-28 11:35:18 +01:00

475 lines
16 KiB
Bash
Executable File

#!/bin/bash
# compare_dirs_improved.sh
#
# Usage: ./compare_dirs_improved.sh [--dry-run] [--threshold <threshold>] [--config <config_file>] [--log-file <file>] [--log-level <level>] [--parallel <num>] [<dir1> <dir2> <words_file>]
#
# This script:
# 1. Scans immediate subdirectories in <dir1> and <dir2> in parallel.
# 2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
# the directory is removed outright.
# 3. The remaining directories are "cleaned" (converted to lowercase, punctuation removed)
# and then grouped by fuzzy similarity using a configurable threshold.
# The fuzzy similarity process is optimized with a multiprocessing helper.
# 4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
# the 1080p directory(ies) are removed (or flagged in dry-run mode).
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
# 7. Supports configuration files for persistent settings.
# 8. Provides comprehensive logging of operations.
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Configuration file that is sourced unless --config names a different one.
CONFIG_FILE='./compare_dirs.conf'
# Initialize log function
#######################################
# Emit one timestamped message on the console (colorized by severity) and
# append the same message, without color codes, to the log file.
# Globals:
#   LOG_ENABLED - nothing is printed or written unless this is "true"
#   LOG_LEVEL   - minimum severity that is emitted: DEBUG, INFO, WARNING or
#                 ERROR; any other value disables filtering
#   LOG_FILE    - file the message is appended to
# Arguments:
#   $1 - severity of this message
#   $2 - message text; printed verbatim (backslashes are NOT interpreted)
# Outputs:
#   The message on stdout and in LOG_FILE.
# Returns:
#   0, also when the message is filtered out.
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp color line

  # Only log if logging is enabled
  if [[ "$LOG_ENABLED" != "true" ]]; then
    return 0
  fi

  # Log level filtering: drop messages below the configured severity.
  case "$LOG_LEVEL" in
    "INFO")
      if [[ "$level" == "DEBUG" ]]; then return 0; fi
      ;;
    "WARNING")
      if [[ "$level" == "DEBUG" || "$level" == "INFO" ]]; then return 0; fi
      ;;
    "ERROR")
      if [[ "$level" == "DEBUG" || "$level" == "INFO" || "$level" == "WARNING" ]]; then return 0; fi
      ;;
  esac

  # Taken only for messages that are actually emitted, and kept apart from
  # the 'local' declaration so a failing 'date' is not masked.
  timestamp=$(date "+%Y-%m-%d %H:%M:%S") || timestamp="unknown time"
  line="[$timestamp] [$level] $message"

  # Print to console with color
  case "$level" in
    "DEBUG") color='\033[36m' ;;   # Cyan
    "INFO") color='\033[32m' ;;    # Green
    "WARNING") color='\033[33m' ;; # Yellow
    "ERROR") color='\033[31m' ;;   # Red
    *) color='' ;;
  esac
  if [[ -n "$color" ]]; then
    # %b expands the escape sequences of the color codes only; the message
    # goes through %s so that backslashes in it (for example in a directory
    # name) are shown exactly as they are.
    printf '%b%s%b\n' "$color" "$line" '\033[0m'
  else
    printf '%s\n' "$line"
  fi

  # Write to log file (without color codes)
  printf '%s\n' "$line" >> "$LOG_FILE"
}
# Default options; the configuration file and the command-line flags may
# replace any of them.
DRY_RUN='false'              # "true": report removals instead of deleting
SIMILARITY_THRESHOLD='0.8'   # minimum similarity score for joining a group
LOG_ENABLED='true'           # "true": log() prints and writes messages
LOG_FILE='./compare_dirs.log'
LOG_LEVEL='INFO'             # DEBUG, INFO, WARNING or ERROR
PARALLEL_PROCESSES='0'       # 0 = use the number of CPU cores
# Load configuration file if it exists
#######################################
# Source a configuration file into the current shell.
# Arguments:
#   $1 - path of the configuration file
# Outputs:
#   A confirmation, or a warning when the file does not exist, on stdout.
# Returns:
#   0 when the file was sourced, 1 when it was not found.
#######################################
load_config() {
  local config_file="$1"

  # Guard clause: without a file there is nothing to source.
  if [[ ! -f "$config_file" ]]; then
    echo "Warning: Configuration file '$config_file' not found. Using defaults."
    return 1
  fi

  # The file is executed as shell code in this shell, so the assignments it
  # makes remain visible to the rest of the script.
  . "$config_file"
  echo "Configuration loaded from $config_file"
  return 0
}
# Process directories in parallel
#######################################
# Classify every immediate subdirectory of a directory against the list of
# undesirable words. The checks run as background jobs, at most $2 at a time.
# Globals:
#   words - (read) array of undesirable words; each word is matched as a
#           literal, case-insensitive substring of the directory name
# Arguments:
#   $1 - directory whose immediate subdirectories are examined
#   $2 - maximum number of concurrent jobs; 0 means "number of CPU cores"
#        (default: 4)
# Outputs:
#   One line per subdirectory on stdout, in glob order: "REMOVE:<path>" when
#   the name contains a word, "KEEP:<path>" otherwise.
#   Log messages are sent to stderr because callers parse stdout.
# Returns:
#   0 on success; 1 when $1 is not a directory, the temporary directory
#   cannot be created, or the result of a job is missing.
#######################################
process_directories_parallel() {
  local dir="$1"
  local max_procs="${2:-4}" # Default to 4 processes if not specified
  local temp_dir d base_lc word pid line remove_flag
  local -a pids=()
  local -a all_dirs=()
  local -i total_dirs=0 i=0 failed=0

  if [[ ! -d "$dir" ]]; then
    log "ERROR" "Directory '$dir' not found in parallel processing." >&2
    return 1
  fi

  # If PARALLEL_PROCESSES is 0, use available CPU cores
  if [[ "$max_procs" -eq 0 ]]; then
    max_procs=$(nproc 2>/dev/null || echo 4)
  fi
  log "DEBUG" "Processing directory '$dir' with $max_procs parallel processes" >&2

  # Get all directories to process
  for d in "$dir"/*; do
    if [[ -d "$d" ]]; then
      all_dirs+=("$d")
    fi
  done
  total_dirs=${#all_dirs[@]}
  log "DEBUG" "Found $total_dirs directories to process" >&2
  if ((total_dirs == 0)); then
    return 0
  fi

  # Created only after validation so that no early return leaves it behind.
  temp_dir=$(mktemp -d) || return 1

  # Process in batches based on max_procs
  for ((i = 0; i < total_dirs; i++)); do
    d="${all_dirs[i]}"
    base_lc="${d##*/}"
    base_lc="${base_lc,,}"

    # Each job writes its verdict to a file named after the directory's
    # index, which lets the results be emitted in the original order.
    {
      remove_flag=false
      # The "+" expansion keeps an empty word list from tripping 'set -u'.
      for word in ${words[@]+"${words[@]}"}; do
        if [[ "$base_lc" == *"${word,,}"* ]]; then
          remove_flag=true
          break
        fi
      done
      if $remove_flag; then
        printf 'REMOVE:%s\n' "$d"
      else
        printf 'KEEP:%s\n' "$d"
      fi
    } > "$temp_dir/$i" &
    pids+=("$!")

    # If we've reached max_procs or this is the last directory, wait for
    # the running jobs before starting more.
    if [[ ${#pids[@]} -eq $max_procs || $i -eq $((total_dirs - 1)) ]]; then
      for pid in "${pids[@]}"; do
        wait "$pid" || failed=1
      done
      pids=()
    fi
  done

  # Output results
  for ((i = 0; i < total_dirs; i++)); do
    if [[ -r "$temp_dir/$i" ]]; then
      while IFS= read -r line; do
        printf '%s\n' "$line"
      done < "$temp_dir/$i"
    else
      failed=1
    fi
  done

  # Clean up temporary directory
  rm -rf -- "$temp_dir"
  return "$failed"
}
# Process command-line flags
#
# Flag values are only recorded while parsing and are applied after the
# configuration file has been sourced, so that a flag given on the command
# line takes precedence over the same setting in the configuration file.

#######################################
# Parse the leading --options of an argument list.
# Globals:
#   CLI_OVERRIDES      - (written) "NAME=value" settings requested by flags
#   CLI_FLAGS_CONSUMED - (written) number of arguments taken by the flags
#   CLI_CONFIG_GIVEN   - (written) "true" when --config was passed
#   CONFIG_FILE        - (written) by --config
# Arguments:
#   The arguments of the script.
# Outputs:
#   The help text or an error message on stdout.
# Exits:
#   0 after --help; 1 on an unknown option or an option without its value.
#######################################
parse_cli_flags() {
  CLI_OVERRIDES=()
  CLI_FLAGS_CONSUMED=0
  CLI_CONFIG_GIVEN=false

  while [[ $# -gt 0 && "$1" == --* ]]; do
    case "$1" in
      --dry-run)
        CLI_OVERRIDES+=("DRY_RUN=true")
        shift
        CLI_FLAGS_CONSUMED=$((CLI_FLAGS_CONSUMED + 1))
        ;;
      --threshold | --config | --log-file | --log-level | --parallel)
        # Without this check a trailing flag would abort under 'set -u'
        # with an "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value."
          echo "Use --help for usage information."
          exit 1
        fi
        case "$1" in
          --threshold) CLI_OVERRIDES+=("SIMILARITY_THRESHOLD=$2") ;;
          --log-file) CLI_OVERRIDES+=("LOG_FILE=$2") ;;
          --log-level) CLI_OVERRIDES+=("LOG_LEVEL=$2") ;;
          --parallel) CLI_OVERRIDES+=("PARALLEL_PROCESSES=$2") ;;
          --config)
            # Needed before the configuration is loaded, so set right away.
            CONFIG_FILE="$2"
            CLI_CONFIG_GIVEN=true
            ;;
        esac
        shift 2
        CLI_FLAGS_CONSUMED=$((CLI_FLAGS_CONSUMED + 2))
        ;;
      --help)
        echo "Usage: $0 [OPTIONS] [<dir1> <dir2> <words_file>]"
        echo
        echo "OPTIONS:"
        echo " --dry-run Preview actions without deleting any directories"
        echo " --threshold <value> Set the fuzzy similarity threshold (default: 0.8)"
        echo " --config <file> Specify a configuration file to use"
        echo " --log-file <file> Specify a log file to use"
        echo " --log-level <level> Set log level: DEBUG, INFO, WARNING, ERROR"
        echo " --parallel <num> Number of parallel processes (0 = auto)"
        echo " --help Display this help message"
        echo
        echo "If no directories and words file are specified, values from config file will be used."
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        echo "Use --help for usage information."
        exit 1
        ;;
    esac
  done
}

#######################################
# Assign every setting recorded by parse_cli_flags to its variable.
# Globals:
#   CLI_OVERRIDES - (read) "NAME=value" entries
#   the variables named in those entries (written)
#######################################
apply_cli_overrides() {
  local assignment
  # The "+" expansion keeps an empty list from tripping 'set -u'.
  for assignment in ${CLI_OVERRIDES[@]+"${CLI_OVERRIDES[@]}"}; do
    printf -v "${assignment%%=*}" '%s' "${assignment#*=}"
  done
}

parse_cli_flags "$@"
# Leave only the positional arguments (<dir1> <dir2> <words_file>).
shift "$CLI_FLAGS_CONSUMED"

# Load configuration file if specified
if [[ -f "$CONFIG_FILE" ]]; then
  load_config "$CONFIG_FILE"
elif [[ "$CLI_CONFIG_GIVEN" == true ]]; then
  # A file requested with --config that does not exist deserves the warning
  # load_config prints; its non-zero status must not stop the script.
  load_config "$CONFIG_FILE" || true
fi

# Command-line flags win over values from the configuration file.
apply_cli_overrides
# Initialize logging
if [[ "$LOG_ENABLED" == "true" ]]; then
  touch "$LOG_FILE"
  log "INFO" "Logging initialized"
fi

# Check if arguments were provided to override config values
if [[ $# -eq 3 ]]; then
  DIR1="$1"
  DIR2="$2"
  WORDS_FILE="$3"
  log "INFO" "Using command-line arguments for directories and words file"
elif [[ $# -ne 0 ]]; then
  log "ERROR" "Incorrect number of arguments"
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] [--config <config_file>] [<dir1> <dir2> <words_file>]"
  exit 1
fi

# Without positional arguments the three inputs have to come from the
# configuration file. Check explicitly: under 'set -u' an unset one would
# otherwise stop the script with a bare "unbound variable" message.
if [[ -z "${DIR1:-}" || -z "${DIR2:-}" || -z "${WORDS_FILE:-}" ]]; then
  log "ERROR" "DIR1, DIR2 and WORDS_FILE must be given as arguments or set in the configuration file"
  echo "Error: DIR1, DIR2 and WORDS_FILE must be given as arguments or set in the configuration file."
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] [--config <config_file>] [<dir1> <dir2> <words_file>]"
  exit 1
fi

log "INFO" "Script started with parameters: DIR1=$DIR1, DIR2=$DIR2, WORDS_FILE=$WORDS_FILE"
log "INFO" "Configuration: THRESHOLD=$SIMILARITY_THRESHOLD, DRY_RUN=$DRY_RUN, PARALLEL_PROCESSES=$PARALLEL_PROCESSES"
# Verify input paths: both directories first, then the words file. The first
# missing path is reported in the log and on the console and ends the script.
for required_dir in "$DIR1" "$DIR2"; do
  if [[ ! -d "$required_dir" ]]; then
    log "ERROR" "Directory '$required_dir' not found."
    echo "Error: Directory '$required_dir' not found."
    exit 1
  fi
done
if [[ ! -f "$WORDS_FILE" ]]; then
  log "ERROR" "Words file '$WORDS_FILE' not found."
  echo "Error: Words file '$WORDS_FILE' not found."
  exit 1
fi

# Load the undesirable words, one array element per line of the file; lines
# that are empty or contain only whitespace are dropped by grep.
readarray -t words < <(grep -v '^[[:space:]]*$' "$WORDS_FILE")
log "INFO" "Loaded ${#words[@]} undesirable words from $WORDS_FILE"
#######################################
# Apply the undesirable-word verdicts to the subdirectories of one tree.
# Reads the "KEEP:<path>" / "REMOVE:<path>" lines printed by
# process_directories_parallel; any other line is ignored.
# Globals:
#   filtered_dirs      - (appended) every directory that is kept
#   DRY_RUN            - (read) "true": only report what would be removed
#   PARALLEL_PROCESSES - (read) passed on to process_directories_parallel
# Arguments:
#   $1 - directory whose immediate subdirectories are filtered
# Outputs:
#   One line per removed (or would-be removed) directory on stdout.
#######################################
prefilter_directory() {
  local root="$1"
  local line dir

  log "INFO" "Processing directories in $root"
  while IFS= read -r line; do
    case "$line" in
      KEEP:*)
        dir="${line#KEEP:}"
        filtered_dirs+=("$dir")
        log "DEBUG" "Keeping directory: $dir"
        ;;
      REMOVE:*)
        dir="${line#REMOVE:}"
        log "INFO" "Removing directory with undesirable word: $dir"
        if $DRY_RUN; then
          echo "Dry-run: would remove '$dir'"
          log "INFO" "Dry-run: would remove '$dir'"
        else
          # "--" protects names that start with "-"; ":?" refuses to run
          # rm with an empty path.
          rm -rf -- "${dir:?}"
          log "INFO" "Removed '$dir'"
          echo "Removed '$dir'"
        fi
        ;;
    esac
  done < <(process_directories_parallel "$root" "$PARALLEL_PROCESSES")
}

echo "=== Pre-filtering Directories by Undesirable Words ==="
log "INFO" "Starting parallel directory filtering"
# Process directories in parallel and process results
filtered_dirs=()
# Process DIR1
prefilter_directory "$DIR1"
# Process DIR2
prefilter_directory "$DIR2"
log "INFO" "Filtered directories remaining: ${#filtered_dirs[@]}"
# Function: Normalize and clean a directory name.
#######################################
# Lowercase a name, delete every character that is not a-z, 0-9 or a space,
# then trim the result and squeeze runs of spaces into one.
# Arguments:
#   $1 - name to normalize
# Outputs:
#   The normalized name on stdout.
#######################################
clean_name() {
  local name="$1"
  local lowered stripped

  lowered=$(echo "$name" | tr '[:upper:]' '[:lower:]')
  stripped=$(printf '%s\n' "$lowered" | sed 's/[^a-z0-9 ]//g')
  # xargs without a command echoes its whitespace-separated words back,
  # which trims and collapses the spaces.
  printf '%s\n' "$stripped" | xargs
}
# Function: Compute fuzzy similarities between a target and a list of strings.
#######################################
# Score a target string against each of a list of strings with Python's
# difflib.SequenceMatcher.
# Arguments:
#   $1    - target string
#   $2... - strings the target is compared with
# Outputs:
#   One line on stdout with the similarity ratios (0.0 to 1.0), separated by
#   single spaces, in the order of the arguments.
# Returns:
#   The exit status of python3.
#######################################
compute_similarities() {
  local target="$1"
  shift
  # Pass target and the list of representatives as command-line arguments to
  # Python. The heredoc delimiter is quoted so the shell never expands
  # anything inside the Python source.
  #
  # The comparisons run in a single Python process: a worker pool cannot be
  # relied on here because workers started with the "spawn" or "forkserver"
  # method cannot import a function defined in a script read from stdin, and
  # starting workers costs more than comparing a handful of short names.
  python3 - "$target" "$@" <<'EOF'
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]
print(" ".join(str(SequenceMatcher(None, target, rep).ratio()) for rep in reps))
EOF
}
echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
log "INFO" "Starting fuzzy similarity grouping"

# Parallel structures describing the groups found so far:
#   group_rep[i] - cleaned name of the first directory placed in group i
#   groups[i]    - newline-separated paths of the directories in group i
declare -a group_rep=()
declare -A groups=()

total_dirs=${#filtered_dirs[@]}
log "INFO" "Grouping $total_dirs directories based on similarity threshold $SIMILARITY_THRESHOLD"

# Each directory joins the first group whose representative scores at least
# SIMILARITY_THRESHOLD against its cleaned name; otherwise it starts a group.
for d in "${filtered_dirs[@]}"; do
  base=$(basename "$d")
  cleaned=$(clean_name "$base")
  matched=false

  if [[ ${#group_rep[@]} -gt 0 ]]; then
    # One helper call scores the cleaned name against all representatives.
    score_line=$(compute_similarities "$cleaned" "${group_rep[@]}")
    read -r -a scores <<< "$score_line"
    for i in "${!scores[@]}"; do
      # bc prints 1 when the comparison holds and 0 when it does not.
      if (( $(bc -l <<< "${scores[$i]} >= $SIMILARITY_THRESHOLD") )); then
        groups["$i"]+=$'\n'"$d"
        log "DEBUG" "Added '$d' to group $i (${group_rep[$i]})"
        matched=true
        break
      fi
    done
  fi

  if $matched; then
    continue
  fi

  # No group was similar enough: this directory becomes a representative.
  next_index=${#group_rep[@]}
  group_rep+=("$cleaned")
  groups["$next_index"]="$d"
  log "DEBUG" "Created new group $next_index with representative '$cleaned'"
done
log "INFO" "Created ${#group_rep[@]} groups after fuzzy similarity matching"
#######################################
# In every group that holds a 2160p directory, remove the directories that
# are 1080p only and drop them from the group.
# A directory counts as 2160p when its name contains "2160p" (any case) and
# as 1080p only when it contains "1080p" but not "2160p", so a name carrying
# both markers is never removed.
# Globals:
#   groups    - (read/written) newline-separated directory paths per group
#   group_rep - (read) representative name per group, used in messages
#   DRY_RUN   - (read) "true": only report what would be removed
# Outputs:
#   Progress messages on stdout.
#######################################
apply_resolution_preference() {
  local key path base
  local has_2160p has_1080p
  local -a paths kept

  for key in "${!groups[@]}"; do
    # mapfile yields one element per line; a plain 'read -a' would stop at
    # the first newline and see only the first directory of the group.
    mapfile -t paths <<< "${groups[$key]}"

    has_2160p=false
    has_1080p=false
    for path in "${paths[@]}"; do
      [[ -n "$path" ]] || continue
      base="${path##*/}"
      base="${base,,}"
      if [[ "$base" == *2160p* ]]; then
        has_2160p=true
        log "DEBUG" "Found 2160p in group $key: $path"
      elif [[ "$base" == *1080p* ]]; then
        has_1080p=true
        log "DEBUG" "Found 1080p in group $key: $path"
      fi
    done

    if ! $has_2160p || ! $has_1080p; then
      continue
    fi

    log "INFO" "Group $key (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories"
    echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
    kept=()
    for path in "${paths[@]}"; do
      [[ -n "$path" ]] || continue
      base="${path##*/}"
      base="${base,,}"
      if [[ "$base" == *1080p* && "$base" != *2160p* ]]; then
        log "INFO" "Removing '$path' because a 2160p version is present"
        echo "Removing '$path' because a 2160p version is present."
        if $DRY_RUN; then
          echo "Dry-run: would remove '$path'"
          log "INFO" "Dry-run: would remove '$path'"
        else
          rm -rf -- "${path:?}"
          log "INFO" "Removed '$path'"
          echo "Removed '$path'"
        fi
      else
        kept+=("$path")
      fi
    done
    # At least the 2160p directory is kept, so the list is never empty.
    groups["$key"]=$(printf '%s\n' "${kept[@]}")
  done
}

echo "=== Resolution Preference Filtering ==="
log "INFO" "Starting resolution preference filtering"
# For each group, if one directory contains "2160p" and another contains "1080p",
# remove the 1080p directory(ies).
apply_resolution_preference
#######################################
# For every group that still has more than one existing directory, list the
# directories and ask which one to remove.
# Globals:
#   groups    - (read) newline-separated directory paths per group
#   group_rep - (read) representative name per group, used in messages
#   DRY_RUN   - (read) "true": only report what would be removed
# Inputs:
#   One answer per prompted group on stdin: the number of the directory to
#   remove; 0, anything that is not a listed number, or end of input skips.
# Outputs:
#   The listing, the prompt and the result on stdout.
#######################################
resolve_duplicates_interactively() {
  local key path p choice dir_to_remove
  local -i i
  local -a paths existing

  for key in "${!groups[@]}"; do
    # mapfile yields one element per line; a plain 'read -a' would stop at
    # the first newline and never see a second directory.
    mapfile -t paths <<< "${groups[$key]}"

    # Filter out directories that no longer exist.
    existing=()
    for path in "${paths[@]}"; do
      if [[ -n "$path" && -d "$path" ]]; then
        existing+=("$path")
      fi
    done
    if [[ ${#existing[@]} -le 1 ]]; then
      continue
    fi

    log "INFO" "Prompting user for duplicate group: ${group_rep[$key]:-unknown}"
    echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
    i=1
    for p in "${existing[@]}"; do
      echo " [$i] $p"
      i=$((i + 1))
    done
    echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
    # End of input counts as "skip" instead of aborting under 'set -e'.
    if ! read -r choice; then
      choice=""
    fi

    if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
      # 10# forces base 10 so that an answer such as "08" is not read as an
      # invalid octal number.
      dir_to_remove="${existing[$((10#$choice - 1))]}"
      log "INFO" "User selected to remove: $dir_to_remove"
      if $DRY_RUN; then
        echo "Dry-run: would remove '$dir_to_remove'"
        log "INFO" "Dry-run: would remove '$dir_to_remove'"
      else
        rm -rf -- "${dir_to_remove:?}"
        log "INFO" "Removed '$dir_to_remove'"
        echo "Removed '$dir_to_remove'"
      fi
    else
      log "INFO" "User skipped removal for group: ${group_rep[$key]:-unknown}"
      echo "No removal selected for this group."
    fi
  done
}

echo "=== Interactive Duplicate Resolution ==="
log "INFO" "Starting interactive duplicate resolution"
# For each group that still contains more than one directory, prompt the user to select a directory to remove.
resolve_duplicates_interactively
log "INFO" "Script completed successfully"
echo "Script completed. See $LOG_FILE for detailed log."