#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold VALUE] DIR1 DIR2 WORDS_FILE
#
# This script:
#   1. Scans immediate subdirectories in DIR1 and DIR2.
#   2. For each directory, if its name contains any undesirable word (one per
#      line in WORDS_FILE), the directory is removed outright.
#   3. The remaining directories are "cleaned" (converted to lowercase,
#      punctuation removed) and then grouped by fuzzy similarity using a
#      configurable threshold (--threshold, default 0.8). Similarity scores
#      come from a python3 helper (compute_similarities).
#   4. Within each group, if one directory's name contains "2160p" and another
#      contains "1080p", the 1080p directory(ies) are removed (or flagged in
#      dry-run mode).
#   5. For any remaining duplicate groups, the user is prompted to select a
#      directory to remove.
#   6. A --dry-run mode is available to preview removals without actually
#      deleting any directories.

set -euo pipefail

# Default options
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8

# Print the usage line to stderr and exit with status 1.
usage() {
  echo "Usage: $0 [--dry-run] [--threshold VALUE] DIR1 DIR2 WORDS_FILE" >&2
  exit 1
}

# Process command-line flags.
# "${1:-}" is required because of `set -u`: once the argument list is
# exhausted, a bare "$1" aborts with "unbound variable" before the usage
# check below can run.
while [[ "${1:-}" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      if [[ "$#" -lt 2 ]]; then
        echo "Error: --threshold requires a value." >&2
        usage
      fi
      SIMILARITY_THRESHOLD="$2"
      shift 2
      ;;
    --)
      # Conventional end-of-options marker.
      shift
      break
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# The threshold is later used in a numeric comparison, so reject anything that
# is not a plain non-negative decimal number up front.
threshold_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
if [[ ! "$SIMILARITY_THRESHOLD" =~ $threshold_re ]]; then
  echo "Error: threshold '$SIMILARITY_THRESHOLD' is not a non-negative number." >&2
  exit 1
fi

if [[ "$#" -ne 3 ]]; then
  usage
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"

# Verify input paths
if [[ ! -d "$DIR1" ]]; then
  echo "Error: Directory '$DIR1' not found." >&2
  exit 1
fi
if [[ ! -d "$DIR2" ]]; then
  echo "Error: Directory '$DIR2' not found." >&2
  exit 1
fi
if [[ ! -f "$WORDS_FILE" ]]; then
  echo "Error: Words file '$WORDS_FILE' not found." >&2
  exit 1
fi

# Read undesirable words (one per line) into an array, ignoring blank lines.
# grep exits non-zero when every line is blank; inside a process substitution
# that status is not seen by `set -e`, and the array simply ends up empty.
mapfile -t words < <(grep -v -- '^[[:space:]]*$' "$WORDS_FILE")

echo "=== Pre-filtering Directories by Undesirable Words ==="
# Create an array to hold directories that do NOT match any undesirable word.
filtered_dirs=()

# Loop over immediate subdirectories in both DIR1 and DIR2.
# An empty DIR1/DIR2 would turn the globs below into "/*", so refuse to
# continue unless both are set and non-empty.
: "${DIR1:?DIR1 must be set to a directory}" "${DIR2:?DIR2 must be set to a directory}"

# Fail before anything is deleted if the tools used by the grouping step are
# missing.
for required_tool in python3 bc; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Error: required command '$required_tool' not found." >&2
    exit 1
  fi
done

# Newline, used as the separator inside each groups[] entry.
# Limitation: directory names that themselves contain a newline are not supported.
nl=$'\n'

# Function: Remove a directory, or only report it in dry-run mode.
# Arguments: $1 - path of the directory
# Globals:   DRY_RUN (read). Deletion happens only when it is exactly "false";
#            any other value (including unset) is treated as dry-run so a
#            misconfiguration can never delete data.
remove_dir() {
  local target="$1"
  if [[ "${DRY_RUN:-true}" != false ]]; then
    echo "Dry-run: would remove '$target'"
  else
    # "--" protects names starting with "-"; ":?" refuses an empty path.
    rm -rf -- "${target:?}"
    echo "Removed '$target'"
  fi
}

# Function: Normalize and clean a directory name.
# Lowercases the name, drops every character other than a-z, 0-9 and space,
# then trims and collapses runs of spaces.
# Arguments: $1 - raw name
# Outputs:   cleaned name on stdout
clean_name() {
  local IFS=' '
  local name="${1,,}"
  local -a parts=()
  # Explicit character list instead of a-z so the result does not depend on
  # the locale's collation order.
  name=${name//[^abcdefghijklmnopqrstuvwxyz0123456789 ]/}
  read -r -a parts <<< "$name" || true
  printf '%s\n' "${parts[*]-}"
}

# Function: Compute fuzzy similarities between a target and a list of strings.
# Arguments: $1 - target string; remaining arguments - candidate strings
# Outputs:   one similarity ratio per candidate, in argument order, one per line
#
# NOTE(review): the original Python body was lost from this file (only the
# "python3 - "$target" "$@"" invocation survived), so this is a reconstruction.
# The surviving comment said the helper used multiprocessing. This version
# scores sequentially with difflib, because a script fed on stdin has no
# importable __main__ module, which multiprocessing's spawn/forkserver start
# methods rely on. Confirm the scoring function matches the intended one.
compute_similarities() {
  local target="$1"
  shift
  # Pass target and the list of representatives as command-line arguments to Python.
  python3 - "$target" "$@" <<'PY'
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
for candidate in sys.argv[2:]:
    # Fixed-point formatting keeps the value parseable by bc (no exponents).
    print("%.6f" % SequenceMatcher(None, target, candidate).ratio())
PY
}

# --- Pre-filter: drop directories whose name contains an undesirable word ---

scan_roots=("$DIR1")
# If both arguments name the same directory, scanning it twice would make
# every subdirectory look like a duplicate of itself.
if [[ ! "$DIR1" -ef "$DIR2" ]]; then
  scan_roots+=("$DIR2")
fi

for root in "${scan_roots[@]}"; do
  for d in "$root"/*; do
    # Also skips the literal pattern left behind by an unmatched glob.
    [[ -d "$d" ]] || continue
    base=${d##*/}
    base_lc=${base,,}
    remove_flag=false
    # Check if the directory name contains any undesirable word
    # (case-insensitive, literal substring match).
    for word in ${words[@]+"${words[@]}"}; do
      word=${word%$'\r'}           # tolerate CRLF word lists
      [[ -n "$word" ]] || continue # an empty word would match every directory
      if [[ "$base_lc" == *"${word,,}"* ]]; then
        remove_flag=true
        break
      fi
    done
    if [[ "$remove_flag" == true ]]; then
      echo "Removing '$d' because it contains an undesirable word."
      remove_dir "$d"
    else
      filtered_dirs+=("$d")
    fi
  done
done

# --- Grouping by fuzzy similarity ---
# NOTE(review): the loop header of this section was lost together with the
# heredoc; it is reconstructed around the surviving comparison/append logic.
# group_rep[i] holds the cleaned name representing group i;
# groups[i] holds that group's paths, newline-separated.
group_rep=()
groups=()

for d in ${filtered_dirs[@]+"${filtered_dirs[@]}"}; do
  cleaned=$(clean_name "${d##*/}")
  added=false
  if [[ "${#group_rep[@]}" -gt 0 ]]; then
    # Capture via $(...) rather than process substitution so a failing helper
    # stops the script instead of silently producing no scores.
    if ! sims_raw=$(compute_similarities "$cleaned" "${group_rep[@]}"); then
      echo "Error: similarity helper failed for '$d'." >&2
      exit 1
    fi
    mapfile -t sims <<< "$sims_raw"
    for i in "${!sims[@]}"; do
      sim=${sims[$i]}
      # bc prints 1 when the comparison holds, 0 otherwise.
      if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
        groups[$i]+="${nl}${d}"
        added=true
        break
      fi
    done
  fi
  if [[ "$added" == false ]]; then
    new_index=${#group_rep[@]}
    group_rep+=("$cleaned")
    groups[$new_index]="$d"
  fi
done

echo "=== Resolution Preference Filtering ==="
# For each group, if one directory contains "2160p" and another contains "1080p",
# remove the 1080p directory(ies).
for (( key = 0; key < ${#groups[@]}; key++ )); do
  # mapfile reads every line; a plain `read -a` would stop after the first
  # path and make every group look like it has a single member.
  mapfile -t paths <<< "${groups[$key]}"
  has_2160p=false
  has_1080p=false
  for path in "${paths[@]}"; do
    name_lc=${path##*/}
    name_lc=${name_lc,,}
    # A name carrying both tags counts as the 2160p copy, never as a 1080p one.
    if [[ "$name_lc" == *2160p* ]]; then
      has_2160p=true
    elif [[ "$name_lc" == *1080p* ]]; then
      has_1080p=true
    fi
  done
  if [[ "$has_2160p" == true && "$has_1080p" == true ]]; then
    echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
    kept=""
    for path in "${paths[@]}"; do
      name_lc=${path##*/}
      name_lc=${name_lc,,}
      if [[ "$name_lc" == *1080p* && "$name_lc" != *2160p* ]]; then
        echo "Removing '$path' because a 2160p version is present."
        remove_dir "$path"
      else
        kept+="${kept:+$nl}${path}"
      fi
    done
    # Removed (or dry-run flagged) paths leave the group so they are not
    # offered again in the interactive step.
    groups[$key]=$kept
  fi
done

echo "=== Interactive Duplicate Resolution ==="
# For each group that still contains more than one directory, prompt the user to select one to remove.
for (( key = 0; key < ${#groups[@]}; key++ )); do
  mapfile -t paths <<< "${groups[$key]}"
  # Filter out directories that no longer exist.
  existing=()
  for path in "${paths[@]}"; do
    if [[ -n "$path" && -d "$path" ]]; then
      existing+=("$path")
    fi
  done
  if [[ "${#existing[@]}" -gt 1 ]]; then
    echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
    i=1
    for p in "${existing[@]}"; do
      echo " [$i] $p"
      i=$((i + 1))
    done
    echo -n "Enter the number of the directory you want to remove (or 0 to skip): "
    # At end of input `read` fails; treat that as "skip" rather than letting
    # `set -e` terminate the script mid-run.
    if ! read -r choice; then
      choice=0
      echo
    fi
    if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -gt 0 ] && [ "$choice" -le "${#existing[@]}" ]; then
      dir_to_remove="${existing[$((choice - 1))]}"
      remove_dir "$dir_to_remove"
    else
      echo "No removal selected for this group."
    fi
  fi
done

echo "Script completed."