first release

2025-02-25 14:24:50 +01:00
parent 5a7fd15e08
commit 430076ad00
2 changed files with 195 additions and 0 deletions
--- a/190
+++ b/190
@@ -0,0 +1,190 @@
+#!/bin/bash
+# compare_dirs.sh
+#
+# Usage: ./compare_dirs.sh [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>
+#
+# This script:
+#   - Scans immediate subdirectories in <dir1> and <dir2>
+#   - Normalizes and “cleans” their names by removing punctuation, converting to lower-case,
+#     trimming whitespace, and removing any words listed in <words_file>
+#   - Groups directories using a fuzzy matching algorithm (with a configurable threshold)
+#   - Automatically removes a directory from a duplicate group if its original name contains an
+#     undesirable word while an alternative does not
+#   - Then automatically removes any remaining duplicates in each group (keeping the first directory)
+#   - Supports a --dry-run mode that shows actions without deleting directories
+
+set -euo pipefail
+
+# Default options
+DRY_RUN=false
+SIMILARITY_THRESHOLD=0.8
+
+# Process command-line flags.
+# "${1:-}" is used instead of "$1": under `set -u` a bare "$1" aborts with
+# "unbound variable" once no arguments are left, so the usage message below
+# would never be reached.
+while [[ "${1:-}" == --* ]]; do
+    case "$1" in
+        --dry-run)
+            DRY_RUN=true
+            shift
+            ;;
+        --threshold)
+            # Without this check a trailing "--threshold" dies on the unbound "$2".
+            if [ "$#" -lt 2 ]; then
+                echo "Error: --threshold requires a value." >&2
+                exit 1
+            fi
+            SIMILARITY_THRESHOLD="$2"
+            shift 2
+            ;;
+        --)
+            # Conventional end-of-options marker; allows directory names starting with "--".
+            shift
+            break
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+# The threshold is later interpolated into a bc expression. bc treats a non-numeric
+# token as an unset variable (value 0), which would make every pair of names "similar",
+# so anything that is not a plain non-negative decimal is rejected here.
+threshold_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
+if ! [[ "$SIMILARITY_THRESHOLD" =~ $threshold_re ]]; then
+    echo "Error: threshold must be a non-negative decimal number, got '$SIMILARITY_THRESHOLD'." >&2
+    exit 1
+fi
+
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 [--dry-run] [--threshold <threshold>] <dir1> <dir2> <words_file>" >&2
+    exit 1
+fi
+
+DIR1="$1"
+DIR2="$2"
+WORDS_FILE="$3"
+
+# Verify input paths
+if [ ! -d "$DIR1" ]; then
+    echo "Error: Directory '$DIR1' not found." >&2
+    exit 1
+fi
+if [ ! -d "$DIR2" ]; then
+    echo "Error: Directory '$DIR2' not found." >&2
+    exit 1
+fi
+if [ ! -f "$WORDS_FILE" ]; then
+    echo "Error: Words file '$WORDS_FILE' not found." >&2
+    exit 1
+fi
+
+# Load the undesirable words (one per line); empty and whitespace-only lines are dropped.
+readarray -t words < <(sed -e '/^[[:space:]]*$/d' "$WORDS_FILE")
+
+# Function: Normalize and clean a directory name.
+clean_name() {
+    local name="$1"
+    # Normalize: convert to lower-case, remove punctuation, and trim extra whitespace.
+    local normalized
+    normalized=$(echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs)
+    local cleaned="$normalized"
+    for word in "${words[@]}"; do
+        # Remove the word (case-insensitive).
+        cleaned=$(echo "$cleaned" | sed "s/$(echo "$word" | tr '[:upper:]' '[:lower:]')//Ig")
+    done
+    # Trim again in case extra spaces were left.
+    cleaned=$(echo "$cleaned" | xargs)
+    echo "$cleaned"
+}
+
+# Function: Print the difflib SequenceMatcher ratio of two strings on stdout.
+#   Arguments: $1, $2 - the two names to compare.
+compute_similarity() {
+    # The Python program is kept in a variable; both names are handed over as argv entries.
+    local -r ratio_script="import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())"
+    local -r left="$1"
+    local -r right="$2"
+    python3 -c "$ratio_script" "$left" "$right"
+}
+
+# Initialize grouping arrays explicitly.
+declare -a group_rep=()   # group_rep[i]: cleaned name of the first directory put into group i.
+declare -A groups=()      # Associative array: groups[i] holds newline-separated directory paths.
+declare -A seen_dirs=()   # Physical paths already handled, so no directory is grouped twice.
+
+# Process immediate subdirectories from both DIR1 and DIR2.
+for d in "$DIR1"/* "$DIR2"/*; do
+    # Skips plain files and the literal pattern left behind by a glob that matched nothing.
+    [ -d "$d" ] || continue
+
+    # When <dir1> and <dir2> are the same directory (or overlap through symlinks) the same
+    # subdirectory is listed twice. Grouping it with itself would make the removal passes
+    # treat it as its own duplicate, so each physical directory is only handled once.
+    real_d=$(CDPATH='' cd -- "$d" 2>/dev/null && pwd -P) || real_d="$d"
+    if [[ -n "${seen_dirs["$real_d"]+x}" ]]; then
+        continue
+    fi
+    seen_dirs["$real_d"]=1
+
+    base=${d##*/}
+    cleaned=$(clean_name "$base")
+    added=false
+    # A name consisting only of punctuation/undesirable words cleans to the empty string.
+    # Two empty strings compare as identical (ratio 1.0), which would group unrelated
+    # directories as duplicates, so empty names never take part in matching.
+    if [[ -n "$cleaned" ]]; then
+        # Compare against each existing group's representative.
+        for i in "${!group_rep[@]}"; do
+            rep="${group_rep[$i]}"
+            [[ -n "$rep" ]] || continue
+            sim=$(compute_similarity "$rep" "$cleaned")
+            # bc prints 1 when the comparison holds and 0 otherwise.
+            if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
+                groups["$i"]+=$'\n'"$d"
+                added=true
+                break
+            fi
+        done
+    fi
+    # If no similar group found, create a new group.
+    if [ "$added" = false ]; then
+        new_index=${#group_rep[@]}
+        group_rep+=("$cleaned")
+        groups["$new_index"]="$d"
+    fi
+done
+
+echo "=== Automatic Removal Based on Undesirable Words ==="
+# For each duplicate group, automatically remove directories whose original names contain
+# an undesirable word if at least one alternative in the group does not.
+for key in "${!groups[@]}"; do
+    IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
+    if [ "${#paths[@]}" -gt 1 ]; then
+        for path in "${paths[@]}"; do
+            base=$(basename "$path")
+            for word in "${words[@]}"; do
+                if echo "$base" | grep -qi "$word"; then
+                    removal_candidate=false
+                    for other in "${paths[@]}"; do
+                        if [ "$other" != "$path" ]; then
+                            other_base=$(basename "$other")
+                            if ! echo "$other_base" | grep -qi "$word"; then
+                                removal_candidate=true
+                                break
+                            fi
+                        fi
+                    done
+                    if $removal_candidate; then
+                        echo "Candidate for auto-removal: $path (matches word: '$word')"
+                        if $DRY_RUN; then
+                            echo "Dry-run: would remove $path"
+                        else
+                            rm -rf "$path"
+                            echo "Removed $path"
+                        fi
+                        # Update the group by removing the candidate.
+                        new_group=()
+                        for p in "${paths[@]}"; do
+                            if [ "$p" != "$path" ]; then
+                                new_group+=("$p")
+                            fi
+                        done
+                        groups["$key"]=$(printf "%s\n" "${new_group[@]}")
+                        # Refresh the paths array.
+                        IFS=$'\n' read -r -a paths <<< "${groups[$key]}" || true
+                        break 2
+                    fi
+                fi
+            done
+        done
+    fi
+done
+
+echo "=== Auto-Removing Remaining Duplicates (Keeping the First Entry) ==="
+# For any remaining duplicate groups, automatically remove all but the first directory.
+for key in "${!groups[@]}"; do
+    # mapfile is required here: `read -a` stops at the first newline, so it would only
+    # ever return the first path of a group and no duplicates would be detected.
+    mapfile -t paths <<< "${groups[$key]}"
+    # Filter out directories that no longer exist.
+    existing=()
+    for path in "${paths[@]}"; do
+        if [ -d "$path" ]; then
+            existing+=("$path")
+        fi
+    done
+    if [ "${#existing[@]}" -gt 1 ]; then
+        echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}):"
+        echo "Keeping: ${existing[0]}"
+        for (( i=1; i<${#existing[@]}; i++ )); do
+            echo "Auto-removing: ${existing[$i]}"
+            if $DRY_RUN; then
+                echo "Dry-run: would remove ${existing[$i]}"
+            else
+                # "--" keeps a path that starts with "-" from being parsed as an rm option.
+                rm -rf -- "${existing[$i]}"
+                echo "Removed ${existing[$i]}"
+            fi
+        done
+    fi
+done
+
+echo "Script completed."
--- a/5
+++ b/5
@@ -0,0 +1,5 @@
+dv
+1080
+finnish
+norwegian
+swedish