From 56d41fb487976828c84efc8276bd6a0acd49c83c Mon Sep 17 00:00:00 2001 From: masterdraco Date: Tue, 25 Feb 2025 21:53:48 +0100 Subject: [PATCH] name change and multiprocessing the fuzzy operation --- compare => compare_dirs.sh | 54 ++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 20 deletions(-) rename compare => compare_dirs.sh (80%) diff --git a/compare b/compare_dirs.sh similarity index 80% rename from compare rename to compare_dirs.sh index f2a8677..948e8cf 100755 --- a/compare +++ b/compare_dirs.sh @@ -9,7 +9,8 @@ # the directory is removed outright. # 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed) # and then grouped by fuzzy similarity using a configurable threshold. -# 4. Within each group, if at least one directory contains "2160p" and one contains "1080p", +# The fuzzy similarity process is optimized with a multiprocessing helper. +# 4. Within each group, if one directory’s name contains "2160p" and another contains "1080p", # the 1080p directory(ies) are removed (or flagged in dry-run mode). # 5. For any remaining duplicate groups, the user is prompted to select a directory to remove. # 6. A --dry-run mode is available to preview removals without actually deleting any directories. @@ -97,15 +98,27 @@ done # Function: Normalize and clean a directory name. clean_name() { local name="$1" - # Normalize: convert to lowercase, remove punctuation, and trim extra whitespace. echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs } -# Function: Compute fuzzy similarity between two names using Python's difflib. -compute_similarity() { - local name1="$1" - local name2="$2" - python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2" +# Function: Compute fuzzy similarities between a target and a list of strings using multiprocessing. 
+compute_similarities() { + local target="$1" + shift + # Pass target and the list of representatives as command-line arguments to Python. + python3 - "$target" "$@" <= $SIMILARITY_THRESHOLD" | bc -l) )); then - groups["$i"]+=$'\n'"$d" - added=true - break - fi - done - # If not added to an existing group, create a new group. + if [ "${#group_rep[@]}" -gt 0 ]; then + # Compute similarities between the cleaned name and all group representatives concurrently. + similarities=$(compute_similarities "$cleaned" "${group_rep[@]}") + read -r -a sims <<< "$similarities" + for i in "${!sims[@]}"; do + if (( $(echo "${sims[$i]} >= $SIMILARITY_THRESHOLD" | bc -l) )); then + groups["$i"]+=$'\n'"$d" + added=true + break + fi + done + fi if [ "$added" = false ]; then new_index=${#group_rep[@]} group_rep+=("$cleaned") @@ -153,7 +167,7 @@ for key in "${!groups[@]}"; do fi done if $has_2160p && $has_1080p; then - echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories." + echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories." new_group=() for path in "${paths[@]}"; do base=$(basename "$path") @@ -185,7 +199,7 @@ for key in "${!groups[@]}"; do fi done if [ "${#existing[@]}" -gt 1 ]; then - echo "Duplicate group (representative cleaned name: ${group_rep[$key]:-unknown}):" + echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):" i=1 for p in "${existing[@]}"; do echo " [$i] $p"