From 56d41fb487976828c84efc8276bd6a0acd49c83c Mon Sep 17 00:00:00 2001
From: masterdraco <root@torrent-mover.powerdata.dk>
Date: Tue, 25 Feb 2025 21:53:48 +0100
Subject: [PATCH] Rename compare to compare_dirs.sh and parallelize the fuzzy similarity step

---
 compare => compare_dirs.sh | 54 ++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 20 deletions(-)
 rename compare => compare_dirs.sh (80%)

diff --git a/compare b/compare_dirs.sh
similarity index 80%
rename from compare
rename to compare_dirs.sh
index f2a8677..948e8cf 100755
--- a/compare
+++ b/compare_dirs.sh
@@ -9,7 +9,8 @@
 #      the directory is removed outright.
 #   3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
 #      and then grouped by fuzzy similarity using a configurable threshold.
-#   4. Within each group, if at least one directory contains "2160p" and one contains "1080p",
+#      Similarity scores against all group representatives are computed in a single Python call that uses a multiprocessing pool.
+#   4. Within each group, if one directory’s name contains "2160p" and another contains "1080p",
 #      the 1080p directory(ies) are removed (or flagged in dry-run mode).
 #   5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
 #   6. A --dry-run mode is available to preview removals without actually deleting any directories.
@@ -97,15 +98,27 @@ done
 # Function: Normalize and clean a directory name.
 clean_name() {
     local name="$1"
-    # Normalize: convert to lowercase, remove punctuation, and trim extra whitespace.
     echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
 }
 
-# Function: Compute fuzzy similarity between two names using Python's difflib.
-compute_similarity() {
-    local name1="$1"
-    local name2="$2"
-    python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
+# Function: Compute difflib similarity ratios between a target ($1) and each remaining argument using a multiprocessing pool; prints the ratios space-separated on one line, in argument order.
+compute_similarities() {
+    local target="$1"
+    shift
+    # Pass target and the list of representatives as command-line arguments to Python.
+    python3 - "$target" "$@" <<EOF
+import sys
+from difflib import SequenceMatcher
+from multiprocessing import Pool
+
+target = sys.argv[1]
+reps = sys.argv[2:]
+def similarity(rep):
+    return SequenceMatcher(None, target, rep).ratio()
+with Pool() as pool:
+    results = pool.map(similarity, reps)
+print(" ".join(map(str, results)))
+EOF
 }
 
 echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
@@ -118,17 +131,18 @@ for d in "${filtered_dirs[@]}"; do
     base=$(basename "$d")
     cleaned=$(clean_name "$base")
     added=false
-    # Compare with each existing group's representative.
-    for i in "${!group_rep[@]}"; do
-        rep="${group_rep[$i]}"
-        sim=$(compute_similarity "$rep" "$cleaned")
-        if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
-            groups["$i"]+=$'\n'"$d"
-            added=true
-            break
-        fi
-    done
-    # If not added to an existing group, create a new group.
+    if [ "${#group_rep[@]}" -gt 0 ]; then
+        # Score the cleaned name against every group representative in one call; sims[i] corresponds to group_rep[i].
+        similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
+        read -r -a sims <<< "$similarities"
+        for i in "${!sims[@]}"; do
+            if (( $(echo "${sims[$i]} >= $SIMILARITY_THRESHOLD" | bc -l) )); then
+                groups["$i"]+=$'\n'"$d"
+                added=true
+                break
+            fi
+        done
+    fi
     if [ "$added" = false ]; then
         new_index=${#group_rep[@]}
         group_rep+=("$cleaned")
@@ -153,7 +167,7 @@ for key in "${!groups[@]}"; do
         fi
     done
     if $has_2160p && $has_1080p; then
-        echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
+        echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
         new_group=()
         for path in "${paths[@]}"; do
             base=$(basename "$path")
@@ -185,7 +199,7 @@ for key in "${!groups[@]}"; do
         fi
     done
     if [ "${#existing[@]}" -gt 1 ]; then
-        echo "Duplicate group (representative cleaned name: ${group_rep[$key]:-unknown}):"
+        echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
         i=1
         for p in "${existing[@]}"; do
             echo "  [$i] $p"