#!/bin/bash
# compare_dirs.sh
#
# Usage: ./compare_dirs.sh [--dry-run] [--threshold <value>] <dir1> <dir2> <words_file>
#
# This script:
#   - Scans the immediate subdirectories of <dir1> and <dir2>
#   - Normalizes and "cleans" their names by converting to lower-case, removing
#     punctuation, trimming whitespace, and removing any words listed in <words_file>
#   - Groups directories using a fuzzy matching algorithm (with a configurable threshold)
#   - Automatically removes a directory from a duplicate group if its original name
#     contains an undesirable word while an alternative does not
#   - Then automatically removes any remaining duplicates in each group
#     (keeping the first directory)
#   - Supports a --dry-run mode that shows actions without deleting directories
#
# Requires: bash 4+ (associative arrays), python3 (difflib) and awk.

set -euo pipefail

# Print the command-line synopsis.
usage() {
  echo "Usage: $0 [--dry-run] [--threshold <value>] <dir1> <dir2> <words_file>"
}

# Print a diagnostic on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Default options
DRY_RUN=false
SIMILARITY_THRESHOLD=0.8

# Process command-line flags.  The "$# > 0" guard matters: under `set -u`,
# reading "$1" with no arguments left aborts with "unbound variable" instead of
# reaching the usage message below.
while (( $# > 0 )) && [[ "$1" == --* ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --threshold)
      (( $# >= 2 )) || die "Error: --threshold requires a value."
      SIMILARITY_THRESHOLD="$2"
      shift 2
      ;;
    --)
      # Explicit end of options, so directory names may start with "--".
      shift
      break
      ;;
    *)
      die "Unknown option: $1"
      ;;
  esac
done

# The threshold is later compared numerically; reject anything that is not a
# plain non-negative decimal up front rather than failing mid-scan.
threshold_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
if [[ ! "$SIMILARITY_THRESHOLD" =~ $threshold_re ]]; then
  die "Error: invalid threshold '$SIMILARITY_THRESHOLD' (expected a number such as 0.8)."
fi

if (( $# != 3 )); then
  usage >&2
  exit 1
fi

DIR1="$1"
DIR2="$2"
WORDS_FILE="$3"

# Verify input paths
if [[ ! -d "$DIR1" ]]; then
  die "Error: Directory '$DIR1' not found."
fi
if [[ ! -d "$DIR2" ]]; then
  die "Error: Directory '$DIR2' not found."
fi
if [[ ! -f "$WORDS_FILE" ]]; then
  die "Error: Words file '$WORDS_FILE' not found."
fi

command -v python3 > /dev/null || die "Error: python3 is required for fuzzy matching."

# Read undesirable words (one per line) into an array.  Surrounding whitespace
# (including a trailing CR from Windows line endings) is trimmed and blank
# lines are skipped.
words=()
while IFS= read -r _line || [[ -n "$_line" ]]; do
  _line="${_line#"${_line%%[![:space:]]*}"}"
  _line="${_line%"${_line##*[![:space:]]}"}"
  if [[ -n "$_line" ]]; then
    words+=("$_line")
  fi
done < "$WORDS_FILE"
unset _line

#######################################
# Lower-case a string, drop everything except a-z, 0-9 and space, collapse runs
# of spaces and trim both ends.
# Arguments: $1 - text to normalize
# Outputs:   normalized text on stdout
#######################################
normalize_text() {
  # Force the C locale so case folding and the a-z / 0-9 ranges are ASCII-only.
  local LC_ALL=C
  local text="${1,,}"
  text="${text//[^a-z0-9 ]/}"
  while [[ "$text" == *"  "* ]]; do
    text="${text//  / }"
  done
  text="${text# }"
  text="${text% }"
  printf '%s\n' "$text"
}

#######################################
# Normalize and clean a directory name.
# Globals:   words (read) - undesirable words to strip
# Arguments: $1 - directory base name
# Outputs:   cleaned name on stdout (may be empty)
#######################################
clean_name() {
  local cleaned word
  cleaned=$(normalize_text "$1")
  for word in "${words[@]}"; do
    # Words get the same normalization as names so an entry such as "H.264"
    # still matches the punctuation-free name.  The quoted pattern makes the
    # removal a literal substring match: nothing from the words file is ever
    # interpreted as a regular expression.
    word=$(normalize_text "$word")
    if [[ -n "$word" ]]; then
      cleaned="${cleaned//"$word"/}"
    fi
  done
  # Normalize again in case removing words left extra spaces behind.
  normalize_text "$cleaned"
}

#######################################
# Compute fuzzy similarity between two names using Python's difflib.
# Arguments: $1, $2 - the names to compare
# Outputs:   SequenceMatcher ratio (0.0 .. 1.0) on stdout
#######################################
compute_similarity() {
  local name1="$1"
  local name2="$2"
  python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
}

#######################################
# Numeric "score >= threshold" test; awk does the floating-point comparison so
# no extra tool such as bc is needed.
# Arguments: $1 - score, $2 - threshold
# Returns:   0 when score >= threshold, 1 otherwise
#######################################
meets_threshold() {
  awk -v s="$1" -v t="$2" 'BEGIN { exit !((s + 0) >= (t + 0)) }'
}

# Initialize grouping arrays explicitly.
declare -a group_rep=() # Array for representative cleaned names.
declare -A groups=()    # Associative array: groups[i] holds newline-separated directory paths.

# Scan each root once.  If both arguments name the same directory, listing it
# twice would make every entry look like a duplicate of itself.
scan_roots=("$DIR1")
if [[ "$DIR1" -ef "$DIR2" ]]; then
  echo "Note: '$DIR1' and '$DIR2' are the same directory; scanning it once." >&2
else
  scan_roots+=("$DIR2")
fi

# Process immediate subdirectories from both DIR1 and DIR2.
for root in "${scan_roots[@]}"; do
  for d in "$root"/*; do
    [[ -d "$d" ]] || continue
    cleaned=$(clean_name "${d##*/}")
    added=false
    # A name that cleans down to nothing carries no information; two empty
    # strings would score 1.0, so such directories always get their own group.
    if [[ -n "$cleaned" ]]; then
      # Compare against each existing group's representative.
      for i in "${!group_rep[@]}"; do
        rep="${group_rep[$i]}"
        [[ -n "$rep" ]] || continue
        if [[ "$rep" == "$cleaned" ]]; then
          sim=1.0 # identical strings: skip the python round-trip
        else
          sim=$(compute_similarity "$rep" "$cleaned")
        fi
        if meets_threshold "$sim" "$SIMILARITY_THRESHOLD"; then
          groups["$i"]+=$'\n'"$d"
          added=true
          break
        fi
      done
    fi
    # If no similar group found, create a new group.
    if [[ "$added" == false ]]; then
      new_index=${#group_rep[@]}
      group_rep+=("$cleaned")
      groups["$new_index"]="$d"
    fi
  done
done

echo "=== Automatic Removal Based on Undesirable Words ==="
# For each duplicate group, automatically remove directories whose original names contain
# an undesirable word if at least one alternative in the group does not.
# Removal passes.
#
# Globals used by everything below:
#   groups    (read/written) - groups[i] holds newline-separated directory paths
#   group_rep (read)         - representative cleaned name per group
#   words     (read)         - undesirable words
#   DRY_RUN   (read)         - "true" to report instead of delete
# Paths containing a newline cannot be represented in `groups`.

# Print the keys of `groups` in ascending numeric order (nothing if empty).
# Associative-array key order is unspecified, so sorting keeps output stable.
sorted_group_keys() {
  if (( ${#groups[@]} > 0 )); then
    printf '%s\n' "${!groups[@]}" | sort -n
  fi
}

# Print the paths of group $1, one per line, dropping blank entries and any
# entry that refers to the same directory as an earlier one.  Without this, a
# directory listed twice would be "kept" once and then deleted as its own
# duplicate.
unique_group_paths() {
  local entry seen is_dup
  local -a raw uniq=()
  # mapfile, not `read -a`: read stops at the first newline and would only
  # ever return the first path of the group.
  mapfile -t raw <<< "${groups[$1]}"
  for entry in "${raw[@]}"; do
    [[ -n "$entry" ]] || continue
    is_dup=false
    for seen in "${uniq[@]}"; do
      if [[ "$entry" == "$seen" || "$entry" -ef "$seen" ]]; then
        is_dup=true
        break
      fi
    done
    if [[ "$is_dup" == false ]]; then
      uniq+=("$entry")
    fi
  done
  if (( ${#uniq[@]} > 0 )); then
    printf '%s\n' "${uniq[@]}"
  fi
}

# Succeed when name $1 contains word $2 (case-insensitive, literal substring;
# the word is never treated as a regex or as a command-line option).
name_has_word() {
  local name="${1,,}" word="${2,,}"
  [[ -n "$word" && "$name" == *"$word"* ]]
}

# Succeed when some path other than $1 has a base name without word $2.
# Arguments: $1 - candidate path, $2 - word, $3.. - all paths of the group
has_clean_alternative() {
  local candidate="$1" word="$2" other
  shift 2
  for other in "$@"; do
    if [[ "$other" != "$candidate" ]] && ! name_has_word "${other##*/}" "$word"; then
      return 0
    fi
  done
  return 1
}

# Delete directory $1, or only report it when DRY_RUN is true.
remove_dir() {
  local target="$1"
  if [[ "$DRY_RUN" == true ]]; then
    echo "Dry-run: would remove $target"
  else
    rm -rf -- "$target"
    echo "Removed $target"
  fi
}

# Pass 1: within each duplicate group, remove directories whose original name
# contains an undesirable word while another member's name does not.  This is
# repeated until no candidate is left, so pass 2 cannot end up keeping an
# undesirable-named directory in preference to a clean one.
remove_undesirable_duplicates() {
  local key path word victim matched_word
  local -a keys paths survivors
  mapfile -t keys < <(sorted_group_keys)
  for key in "${keys[@]}"; do
    mapfile -t paths < <(unique_group_paths "$key")
    while (( ${#paths[@]} > 1 )); do
      victim=""
      matched_word=""
      for path in "${paths[@]}"; do
        for word in "${words[@]}"; do
          if name_has_word "${path##*/}" "$word" \
            && has_clean_alternative "$path" "$word" "${paths[@]}"; then
            victim="$path"
            matched_word="$word"
            break 2
          fi
        done
      done
      [[ -n "$victim" ]] || break

      echo "Candidate for auto-removal: $victim (matches word: '$matched_word')"
      remove_dir "$victim"

      # Update the group by removing the candidate (also in dry-run mode, so
      # pass 2 reports what a real run would do).
      survivors=()
      for path in "${paths[@]}"; do
        if [[ "$path" != "$victim" ]]; then
          survivors+=("$path")
        fi
      done
      paths=("${survivors[@]}")
      groups["$key"]=$(printf '%s\n' "${paths[@]}")
    done
  done
}

# Pass 2: for any remaining duplicate groups, remove all but the first
# directory that still exists.
remove_remaining_duplicates() {
  local key path i
  local -a keys paths existing
  mapfile -t keys < <(sorted_group_keys)
  for key in "${keys[@]}"; do
    mapfile -t paths < <(unique_group_paths "$key")
    # Filter out directories that no longer exist.
    existing=()
    for path in "${paths[@]}"; do
      if [[ -d "$path" ]]; then
        existing+=("$path")
      fi
    done
    if (( ${#existing[@]} > 1 )); then
      echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
      echo "Keeping: ${existing[0]}"
      for (( i = 1; i < ${#existing[@]}; i++ )); do
        echo "Auto-removing: ${existing[$i]}"
        remove_dir "${existing[$i]}"
      done
    fi
  done
}

remove_undesirable_duplicates

echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
remove_remaining_duplicates

echo "Script completed."
diff --git a/words b/words new file mode 100644 index 0000000..4f15897 --- /dev/null +++ b/words @@ -0,0 +1,5 @@ +dv +1080 +finnish +norwegian +swedish