#!/bin/bash
# compare_dirs_improved.sh
#
# Usage: ./compare_dirs_improved.sh [--dry-run] [--threshold <threshold>] [--config <config_file>]
#          [--log-file <file>] [--log-level <level>] [--parallel <num>]
#          [<dir1> <dir2> <words_file>]
#
# This script:
# 1. Scans immediate subdirectories in <dir1> and <dir2> in parallel.
# 2. For each directory, if its name contains any undesirable word (one per line in <words_file>),
#    the directory is removed outright.
# 3. The remaining directories are "cleaned" (converted to lowercase, punctuation removed)
#    and then grouped by fuzzy similarity using a configurable threshold.
#    The fuzzy similarity process is optimized with a multiprocessing helper.
# 4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
#    the 1080p directory(ies) are removed (or flagged in dry-run mode).
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
# 7. Supports configuration files for persistent settings.
# 8. Provides comprehensive logging of operations.

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Default configuration file location (overridable with --config).
CONFIG_FILE="./compare_dirs.conf"

#######################################
# Write a timestamped message to the console and to the log file.
# Globals:   LOG_ENABLED (read), LOG_LEVEL (read), LOG_FILE (appended to)
# Arguments: $1 - severity of the message: DEBUG, INFO, WARNING or ERROR
#            $2 - message text
# Outputs:   a colourised line on stderr and a plain line appended to
#            $LOG_FILE; nothing at all when LOG_ENABLED is not "true" or the
#            message is below the LOG_LEVEL threshold.
# Returns:   0
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp colour

  # Only log if logging is enabled.
  if [[ "$LOG_ENABLED" != "true" ]]; then
    return 0
  fi

  # Log level filtering. An unrecognised LOG_LEVEL (and DEBUG) filters nothing.
  case "$LOG_LEVEL" in
    "INFO")
      if [[ "$level" == "DEBUG" ]]; then return 0; fi
      ;;
    "WARNING")
      if [[ "$level" == "DEBUG" || "$level" == "INFO" ]]; then return 0; fi
      ;;
    "ERROR")
      if [[ "$level" == "DEBUG" || "$level" == "INFO" || "$level" == "WARNING" ]]; then return 0; fi
      ;;
  esac

  # Assigned separately from `local` so a failing `date` is not masked.
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")

  case "$level" in
    "DEBUG") colour='\033[36m' ;;   # Cyan
    "INFO") colour='\033[32m' ;;    # Green
    "WARNING") colour='\033[33m' ;; # Yellow
    "ERROR") colour='\033[31m' ;;   # Red
    *) colour='' ;;
  esac

  # Console output goes to stderr: log() is also called from functions whose
  # stdout is parsed by the caller, where a stdout message would be swallowed.
  # The message is passed through %s so backslashes in it are printed as-is.
  if [[ -n "$colour" ]]; then
    printf '%b[%s] [%s] %s%b\n' "$colour" "$timestamp" "$level" "$message" '\033[0m' >&2
  else
    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >&2
  fi

  # Write to log file (without color codes).
  printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"
}

# Default options (a configuration file or command-line flags may override them).
DRY_RUN=false                    # "true": report removals instead of deleting
SIMILARITY_THRESHOLD=0.8         # minimum similarity ratio for grouping
LOG_ENABLED=true                 # "true": log() prints and appends to LOG_FILE
LOG_FILE="./compare_dirs.log"
LOG_LEVEL="INFO"                 # DEBUG, INFO, WARNING or ERROR
PARALLEL_PROCESSES=0             # 0 = auto

#######################################
# Load settings from a configuration file by sourcing it.
# Arguments: $1 - path of the configuration file
# Outputs:   confirmation on stdout; a warning on stderr if the file is missing
# Returns:   0 if the file was sourced, 1 if it does not exist
#######################################
load_config() {
  local config_file="$1"
  local source_path="$config_file"

  if [[ ! -f "$config_file" ]]; then
    echo "Warning: Configuration file '$config_file' not found. Using defaults." >&2
    return 1
  fi

  # `source` looks a slash-less name up in $PATH before the current directory;
  # anchor it so the file that was just tested is the one that gets loaded.
  if [[ "$source_path" != */* ]]; then
    source_path="./$source_path"
  fi

  # NOTE(review): sourcing executes whatever shell code the file contains, so
  # the configuration file must come from a trusted location.
  # shellcheck source=/dev/null
  source "$source_path"
  echo "Configuration loaded from $config_file"
  return 0
}

#######################################
# Classify the immediate subdirectories of a directory as KEEP or REMOVE,
# checking them in batches of background jobs.
# A subdirectory is marked REMOVE when its name contains any entry of the
# global `words` array as a literal, case-insensitive substring.
# Globals:   words (read)
# Arguments: $1 - directory whose immediate subdirectories are examined
#            $2 - maximum concurrent jobs (default 4; 0 or a non-numeric
#                 value means "use nproc, or 4 if nproc is unavailable")
# Outputs:   one line per subdirectory on stdout: "KEEP:<path>" or
#            "REMOVE:<path>"; log messages are sent to stderr
# Returns:   0 on success, 1 if $1 is not a directory or no temporary
#            directory could be created
#######################################
process_directories_parallel() {
  local dir="$1"
  local max_procs="${2:-4}"
  local temp_dir entry base lowered_base word verdict pid
  local i j total_dirs
  local -a pids=()
  local -a all_dirs=()

  # Validate before creating any temporary state so nothing leaks on failure.
  if [[ ! -d "$dir" ]]; then
    log "ERROR" "Directory '$dir' not found in parallel processing." >&2
    return 1
  fi

  # Normalise the job count (10# avoids octal parsing of values like "08").
  if [[ "$max_procs" =~ ^[0-9]+$ ]]; then
    max_procs=$((10#$max_procs))
  else
    max_procs=0
  fi
  # 0 means: use the available CPU cores.
  if ((max_procs == 0)); then
    max_procs=$(nproc 2>/dev/null || echo 4)
  fi

  # stdout carries only KEEP:/REMOVE: records, so log output is forced to stderr.
  log "DEBUG" "Processing directory '$dir' with $max_procs parallel processes" >&2

  # Get all directories to process. An unmatched glob stays literal and is
  # rejected by the -d test.
  for entry in "$dir"/*; do
    if [[ -d "$entry" ]]; then
      all_dirs+=("$entry")
    fi
  done

  total_dirs=${#all_dirs[@]}
  log "DEBUG" "Found $total_dirs directories to process" >&2

  if ((total_dirs == 0)); then
    return 0
  fi

  temp_dir=$(mktemp -d) || {
    log "ERROR" "Could not create a temporary directory." >&2
    return 1
  }

  # Process in batches of at most max_procs background jobs.
  for ((i = 0; i < total_dirs; i++)); do
    entry="${all_dirs[i]}"
    base="${entry##*/}"

    # Each job writes its verdict to a file named after its slot in the batch.
    {
      verdict="KEEP"
      lowered_base="${base,,}"
      # The ${words[@]+...} form tolerates an empty or unset array under set -u.
      for word in ${words[@]+"${words[@]}"}; do
        # An empty word would match every name; never treat it as a hit.
        [[ -n "$word" ]] || continue
        # Quoted pattern => literal substring match, not a glob or regex.
        if [[ "$lowered_base" == *"${word,,}"* ]]; then
          verdict="REMOVE"
          break
        fi
      done
      printf '%s:%s\n' "$verdict" "$entry"
    } > "$temp_dir/${#pids[@]}" &

    pids+=("$!")

    # Batch full, or last directory reached: wait, then emit results in order.
    if ((${#pids[@]} >= max_procs || i == total_dirs - 1)); then
      for pid in "${pids[@]}"; do
        wait "$pid" || log "WARNING" "Background check (pid $pid) failed" >&2
      done

      for ((j = 0; j < ${#pids[@]}; j++)); do
        cat -- "$temp_dir/$j"
      done

      # Reset for next batch.
      pids=()
    fi
  done

  # Clean up temporary directory.
  rm -rf -- "$temp_dir"
}

#######################################
# Process the leading "--" command-line flags.
# Parsing stops at the first argument that does not start with "--".
# Globals:   DRY_RUN, SIMILARITY_THRESHOLD, CONFIG_FILE, LOG_FILE, LOG_LEVEL,
#            PARALLEL_PROCESSES (written when the matching flag is given);
#            FLAG_ARGS_CONSUMED (written: number of arguments parsed)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help; error messages on stderr
# Returns:   0; exits 0 after --help, exits 1 on an unknown flag or a flag
#            that is missing its value
#######################################
parse_flags() {
  FLAG_ARGS_CONSUMED=0

  while [[ $# -gt 0 && "$1" == --* ]]; do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        shift
        FLAG_ARGS_CONSUMED=$((FLAG_ARGS_CONSUMED + 1))
        ;;
      --threshold | --config | --log-file | --log-level | --parallel)
        # Without this check a trailing flag would hit an unbound "$2".
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value." >&2
          echo "Use --help for usage information." >&2
          exit 1
        fi
        case "$1" in
          --threshold) SIMILARITY_THRESHOLD="$2" ;;
          --config) CONFIG_FILE="$2" ;;
          --log-file) LOG_FILE="$2" ;;
          --log-level) LOG_LEVEL="$2" ;;
          --parallel) PARALLEL_PROCESSES="$2" ;;
        esac
        shift 2
        FLAG_ARGS_CONSUMED=$((FLAG_ARGS_CONSUMED + 2))
        ;;
      --help)
        echo "Usage: $0 [OPTIONS] [<dir1> <dir2> <words_file>]"
        echo
        echo "OPTIONS:"
        echo " --dry-run Preview actions without deleting any directories"
        echo " --threshold <value> Set the fuzzy similarity threshold (default: 0.8)"
        echo " --config <file> Specify a configuration file to use"
        echo " --log-file <file> Specify a log file to use"
        echo " --log-level <level> Set log level: DEBUG, INFO, WARNING, ERROR"
        echo " --parallel <num> Number of parallel processes (0 = auto)"
        echo " --help Display this help message"
        echo
        echo "If no directories and words file are specified, values from config file will be used."
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use --help for usage information." >&2
        exit 1
        ;;
    esac
  done
}

# First pass: learn which configuration file to use (and honour --help).
parse_flags "$@"

# Load configuration file if it exists.
if [[ -f "$CONFIG_FILE" ]]; then
  load_config "$CONFIG_FILE"
  # Second pass: the file was sourced after the flags were read, so re-apply
  # the flags to make explicit command-line options win over file settings.
  parse_flags "$@"
fi

# Drop the parsed flags; only positional arguments remain in "$@".
shift "$FLAG_ARGS_CONSUMED"

# Initialize logging: make sure the log file exists before the first message.
if [[ "$LOG_ENABLED" == "true" ]]; then
  touch "$LOG_FILE"
  log "INFO" "Logging initialized"
fi

# Positional arguments, when given, override DIR1/DIR2/WORDS_FILE from the
# configuration file. Exactly three or none are accepted.
if [[ $# -eq 3 ]]; then
  DIR1="$1"
  DIR2="$2"
  WORDS_FILE="$3"
  log "INFO" "Using command-line arguments for directories and words file"
elif [[ $# -ne 0 ]]; then
  log "ERROR" "Incorrect number of arguments"
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] [--config <config_file>] [<dir1> <dir2> <words_file>]" >&2
  exit 1
fi

# Without positional arguments the three inputs must have been set by the
# configuration file. Check explicitly: under `set -u` an unset DIR1 would
# otherwise abort below with a bare "unbound variable" message.
if [[ -z "${DIR1:-}" || -z "${DIR2:-}" || -z "${WORDS_FILE:-}" ]]; then
  log "ERROR" "DIR1, DIR2 and WORDS_FILE must be given as arguments or set in the configuration file"
  echo "Error: DIR1, DIR2 and WORDS_FILE must be given as arguments or set in the configuration file." >&2
  echo "Usage: $0 [--dry-run] [--threshold <threshold>] [--config <config_file>] [<dir1> <dir2> <words_file>]" >&2
  exit 1
fi

log "INFO" "Script started with parameters: DIR1=$DIR1, DIR2=$DIR2, WORDS_FILE=$WORDS_FILE"
log "INFO" "Configuration: THRESHOLD=$SIMILARITY_THRESHOLD, DRY_RUN=$DRY_RUN, PARALLEL_PROCESSES=$PARALLEL_PROCESSES"

# Verify input paths.
if [[ ! -d "$DIR1" ]]; then
  log "ERROR" "Directory '$DIR1' not found."
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [[ ! -d "$DIR2" ]]; then
  log "ERROR" "Directory '$DIR2' not found."
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [[ ! -f "$WORDS_FILE" ]]; then
  log "ERROR" "Words file '$WORDS_FILE' not found."
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi

#######################################
# Read undesirable words (one per line) into the global `words` array.
# Blank and whitespace-only lines are ignored. A trailing carriage return is
# stripped so word lists saved with CRLF line endings still match. A final
# line without a terminating newline is kept.
# Globals:   words (written)
# Arguments: $1 - path of the words file
# Returns:   0 on success, non-zero if the file cannot be read
#######################################
load_words() {
  local words_path="$1"
  local line

  words=()
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    # Keep the line only if it contains a non-whitespace character.
    if [[ "$line" == *[![:space:]]* ]]; then
      words+=("$line")
    fi
  done < "$words_path"
}

load_words "$WORDS_FILE"
log "INFO" "Loaded ${#words[@]} undesirable words from $WORDS_FILE"

echo "=== Pre-filtering Directories by Undesirable Words ==="
log "INFO" "Starting parallel directory filtering"

# Directories that survive the undesirable-word filter, from both inputs.
filtered_dirs=()

#######################################
# Apply the undesirable-word filter to one input directory.
# Reads the KEEP:/REMOVE: records produced by process_directories_parallel;
# kept paths are appended to filtered_dirs, flagged paths are deleted (or
# only reported in dry-run mode). Any other line is ignored.
# Globals:   filtered_dirs (appended), DRY_RUN (read),
#            PARALLEL_PROCESSES (read)
# Arguments: $1 - directory whose subdirectories are filtered
# Outputs:   one line per removed (or would-be-removed) directory on stdout
#######################################
prefilter_directory() {
  local source_dir="$1"
  local line target

  log "INFO" "Processing directories in $source_dir"
  while IFS= read -r line; do
    case "$line" in
      KEEP:*)
        target="${line#KEEP:}"
        filtered_dirs+=("$target")
        log "DEBUG" "Keeping directory: $target"
        ;;
      REMOVE:*)
        target="${line#REMOVE:}"
        log "INFO" "Removing directory with undesirable word: $target"
        # Fail safe: only the exact value "false" permits deletion, so a
        # mistyped DRY_RUN setting can never fall through to rm.
        if [[ "$DRY_RUN" != "false" ]]; then
          echo "Dry-run: would remove '$target'"
          log "INFO" "Dry-run: would remove '$target'"
        else
          # ${target:?} aborts rather than running rm on an empty path.
          rm -rf -- "${target:?}"
          log "INFO" "Removed '$target'"
          echo "Removed '$target'"
        fi
        ;;
    esac
  done < <(process_directories_parallel "$source_dir" "$PARALLEL_PROCESSES")
}

prefilter_directory "$DIR1"
prefilter_directory "$DIR2"

log "INFO" "Filtered directories remaining: ${#filtered_dirs[@]}"

#######################################
# Normalize and clean a directory name for fuzzy comparison.
# Lowercases the name, deletes every character other than a-z, 0-9 and
# space, then trims the ends and collapses runs of spaces to one space.
# Arguments: $1 - raw directory name
# Outputs:   the cleaned name on stdout
#######################################
clean_name() {
  local name="$1"
  # Byte-wise C locale keeps the a-z range independent of the user's locale.
  local LC_ALL=C
  local IFS=' '
  local lowered stripped
  local -a tokens=()

  lowered="${name,,}"
  stripped="${lowered//[^a-z0-9 ]/}"

  # Splitting on spaces and re-joining trims and collapses whitespace without
  # spawning external processes (and without echo misreading names like "-n").
  read -r -a tokens <<< "$stripped" || true
  printf '%s\n' "${tokens[*]-}"
}

#######################################
# Compute fuzzy similarities between a target and a list of strings.
# Arguments: $1  - target string
#            $2+ - strings to compare against the target
# Outputs:   one line on stdout with the difflib.SequenceMatcher ratio of the
#            target against each string, space-separated, in argument order
# Returns:   exit status of python3
#######################################
compute_similarities() {
  local target="$1"
  shift
  # Pass target and the list of representatives as command-line arguments to
  # Python. The quoted delimiter keeps the shell from expanding the program.
  python3 - "$target" "$@" <<'EOF'
import multiprocessing
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]


def similarity(rep):
    return SequenceMatcher(None, target, rep).ratio()


# This program is read from stdin, so worker processes started with the
# "spawn" or "forkserver" methods cannot re-import `similarity`; only "fork"
# works. A pool is also only worthwhile for long lists, so short lists (and
# platforms without fork) are handled serially with identical results.
POOL_MIN_ITEMS = 64
use_pool = (
    len(reps) >= POOL_MIN_ITEMS
    and "fork" in multiprocessing.get_all_start_methods()
)

if use_pool:
    with multiprocessing.get_context("fork").Pool() as pool:
        results = pool.map(similarity, reps)
else:
    results = [similarity(rep) for rep in reps]

print(" ".join(map(str, results)))
EOF
}

echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
log "INFO" "Starting fuzzy similarity grouping"

# Initialize grouping arrays. Both are indexed by group number; an indexed
# (not associative) array makes later iteration over groups follow creation
# order instead of hash order.
declare -a group_rep=() # group_rep[i]: cleaned name representing group i.
declare -a groups=()    # groups[i]: newline-separated directory paths.

total_dirs=${#filtered_dirs[@]}
log "INFO" "Grouping $total_dirs directories based on similarity threshold $SIMILARITY_THRESHOLD"

# A non-numeric threshold would compare as 0 and merge everything into one
# group, so reject it up front.
if [[ ! "$SIMILARITY_THRESHOLD" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
  log "ERROR" "Invalid similarity threshold '$SIMILARITY_THRESHOLD'"
  echo "Error: Invalid similarity threshold '$SIMILARITY_THRESHOLD' (expected a number such as 0.8)." >&2
  exit 1
fi

# Group the directories in filtered_dirs. Each directory joins the first
# existing group whose representative reaches the threshold, otherwise it
# starts a new group and becomes that group's representative.
# (The ${arr[@]+...} form tolerates an empty array under set -u.)
for d in ${filtered_dirs[@]+"${filtered_dirs[@]}"}; do
  base="${d##*/}"
  cleaned=$(clean_name "$base")
  match_index=""

  if ((${#group_rep[@]} > 0)); then
    # One similarity ratio per representative, in group order.
    similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
    # Index of the first ratio >= threshold, or empty if there is none.
    match_index=$(awk -v threshold="$SIMILARITY_THRESHOLD" '
      {
        for (k = 1; k <= NF; k++) {
          if (($k + 0) >= (threshold + 0)) { print k - 1; exit }
        }
      }' <<< "$similarities")
  fi

  if [[ -n "$match_index" ]]; then
    groups[match_index]+=$'\n'"$d"
    log "DEBUG" "Added '$d' to group $match_index (${group_rep[match_index]})"
  else
    new_index=${#group_rep[@]}
    group_rep+=("$cleaned")
    groups[new_index]="$d"
    log "DEBUG" "Created new group $new_index with representative '$cleaned'"
  fi
done

log "INFO" "Created ${#group_rep[@]} groups after fuzzy similarity matching"

echo "=== Resolution Preference Filtering ==="
log "INFO" "Starting resolution preference filtering"

#######################################
# For each group, if one directory's name contains "2160p" and another
# contains "1080p" (case-insensitive), remove the 1080p directory(ies).
# A name containing both tags counts as a 2160p directory and is kept.
# Globals:   groups (read; rewritten for groups that lose members),
#            group_rep (read), DRY_RUN (read)
# Outputs:   progress lines on stdout
#######################################
apply_resolution_preference() {
  local key path name found_2160p
  local -a paths survivors doomed

  for key in "${!groups[@]}"; do
    # mapfile reads every line of the group; `read -a` would stop after the
    # first newline and see only the first member.
    mapfile -t paths <<< "${groups[$key]}"

    found_2160p=false
    survivors=()
    doomed=()
    for path in "${paths[@]}"; do
      [[ -n "$path" ]] || continue
      name="${path##*/}"
      name="${name,,}"
      if [[ "$name" == *2160p* ]]; then
        found_2160p=true
        log "DEBUG" "Found 2160p in group $key: $path"
        survivors+=("$path")
      elif [[ "$name" == *1080p* ]]; then
        log "DEBUG" "Found 1080p in group $key: $path"
        doomed+=("$path")
      else
        survivors+=("$path")
      fi
    done

    # Nothing to do unless the group holds both resolutions.
    if [[ "$found_2160p" != "true" ]] || ((${#doomed[@]} == 0)); then
      continue
    fi

    log "INFO" "Group $key (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories"
    echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."

    for path in "${doomed[@]}"; do
      log "INFO" "Removing '$path' because a 2160p version is present"
      echo "Removing '$path' because a 2160p version is present."
      # Fail safe: only the exact value "false" permits deletion.
      if [[ "$DRY_RUN" != "false" ]]; then
        echo "Dry-run: would remove '$path'"
        log "INFO" "Dry-run: would remove '$path'"
      else
        rm -rf -- "${path:?}"
        log "INFO" "Removed '$path'"
        echo "Removed '$path'"
      fi
    done

    # survivors is never empty here: it holds at least the 2160p directory.
    groups["$key"]=$(printf '%s\n' "${survivors[@]}")
  done
}

apply_resolution_preference

echo "=== Interactive Duplicate Resolution ==="
log "INFO" "Starting interactive duplicate resolution"

#######################################
# For each group that still contains more than one existing directory,
# prompt the user to select one directory to remove.
# Globals:   groups (read), group_rep (read), DRY_RUN (read)
# Inputs:    one line per prompted group on stdin: the number of the
#            directory to remove; 0, anything invalid, or end-of-input skips
# Outputs:   the numbered list, prompt and result on stdout
#######################################
resolve_duplicates_interactively() {
  local key path choice index victim
  local -a paths existing

  for key in "${!groups[@]}"; do
    # mapfile reads every line of the group; `read -a` would stop after the
    # first newline and see only the first member.
    mapfile -t paths <<< "${groups[$key]}"

    # Filter out directories that no longer exist.
    existing=()
    for path in "${paths[@]}"; do
      if [[ -n "$path" && -d "$path" ]]; then
        existing+=("$path")
      fi
    done

    if ((${#existing[@]} <= 1)); then
      continue
    fi

    log "INFO" "Prompting user for duplicate group: ${group_rep[$key]:-unknown}"
    echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
    for index in "${!existing[@]}"; do
      echo " [$((index + 1))] ${existing[$index]}"
    done
    echo -n "Enter the number of the directory you want to remove (or 0 to skip): "

    # End-of-input counts as "skip" instead of aborting under set -e.
    if ! read -r choice; then
      choice=0
    fi

    # 10# forces base 10 so input such as "08" is not parsed as octal.
    if [[ "$choice" =~ ^[0-9]+$ ]] \
      && ((10#$choice > 0 && 10#$choice <= ${#existing[@]})); then
      victim="${existing[$((10#$choice - 1))]}"
      log "INFO" "User selected to remove: $victim"
      # Fail safe: only the exact value "false" permits deletion.
      if [[ "$DRY_RUN" != "false" ]]; then
        echo "Dry-run: would remove '$victim'"
        log "INFO" "Dry-run: would remove '$victim'"
      else
        rm -rf -- "${victim:?}"
        log "INFO" "Removed '$victim'"
        echo "Removed '$victim'"
      fi
    else
      log "INFO" "User skipped removal for group: ${group_rep[$key]:-unknown}"
      echo "No removal selected for this group."
    fi
  done
}

resolve_duplicates_interactively

log "INFO" "Script completed successfully"
echo "Script completed. See $LOG_FILE for detailed log."