name change and multiprocessing the fuzzy operation

This commit is contained in:
masterdraco 2025-02-25 21:53:48 +01:00
parent 9e6961c406
commit 56d41fb487

View File

@ -9,7 +9,8 @@
# the directory is removed outright. # the directory is removed outright.
# 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed) # 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
# and then grouped by fuzzy similarity using a configurable threshold. # and then grouped by fuzzy similarity using a configurable threshold.
# 4. Within each group, if at least one directory contains "2160p" and one contains "1080p", # The fuzzy similarity process is optimized with a multiprocessing helper.
# 4. Within each group, if one directory's name contains "2160p" and another contains "1080p",
# the 1080p directory(ies) are removed (or flagged in dry-run mode). # the 1080p directory(ies) are removed (or flagged in dry-run mode).
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove. # 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
# 6. A --dry-run mode is available to preview removals without actually deleting any directories. # 6. A --dry-run mode is available to preview removals without actually deleting any directories.
@ -97,15 +98,27 @@ done
# Function: Normalize and clean a directory name. # Function: Normalize and clean a directory name.
clean_name() { clean_name() {
local name="$1" local name="$1"
# Normalize: convert to lowercase, remove punctuation, and trim extra whitespace.
echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs echo "$name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9 ]//g' | xargs
} }
# Function: Compute fuzzy similarity between two names using Python's difflib. # Function: Compute fuzzy similarities between a target and a list of strings using multiprocessing.
compute_similarity() { compute_similarities() {
local name1="$1" local target="$1"
local name2="$2" shift
python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2" # Pass target and the list of representatives as command-line arguments to Python.
python3 - "$target" "$@" <<EOF
import sys
from difflib import SequenceMatcher
from multiprocessing import Pool
target = sys.argv[1]
reps = sys.argv[2:]
def similarity(rep):
return SequenceMatcher(None, target, rep).ratio()
with Pool() as pool:
results = pool.map(similarity, reps)
print(" ".join(map(str, results)))
EOF
} }
echo "=== Grouping Remaining Directories by Fuzzy Similarity ===" echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
@ -118,17 +131,18 @@ for d in "${filtered_dirs[@]}"; do
base=$(basename "$d") base=$(basename "$d")
cleaned=$(clean_name "$base") cleaned=$(clean_name "$base")
added=false added=false
# Compare with each existing group's representative. if [ "${#group_rep[@]}" -gt 0 ]; then
for i in "${!group_rep[@]}"; do # Compute similarities between the cleaned name and all group representatives concurrently.
rep="${group_rep[$i]}" similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
sim=$(compute_similarity "$rep" "$cleaned") read -r -a sims <<< "$similarities"
if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then for i in "${!sims[@]}"; do
groups["$i"]+=$'\n'"$d" if (( $(echo "${sims[$i]} >= $SIMILARITY_THRESHOLD" | bc -l) )); then
added=true groups["$i"]+=$'\n'"$d"
break added=true
fi break
done fi
# If not added to an existing group, create a new group. done
fi
if [ "$added" = false ]; then if [ "$added" = false ]; then
new_index=${#group_rep[@]} new_index=${#group_rep[@]}
group_rep+=("$cleaned") group_rep+=("$cleaned")
@ -153,7 +167,7 @@ for key in "${!groups[@]}"; do
fi fi
done done
if $has_2160p && $has_1080p; then if $has_2160p && $has_1080p; then
echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories." echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
new_group=() new_group=()
for path in "${paths[@]}"; do for path in "${paths[@]}"; do
base=$(basename "$path") base=$(basename "$path")
@ -185,7 +199,7 @@ for key in "${!groups[@]}"; do
fi fi
done done
if [ "${#existing[@]}" -gt 1 ]; then if [ "${#existing[@]}" -gt 1 ]; then
echo "Duplicate group (representative cleaned name: ${group_rep[$key]:-unknown}):" echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
i=1 i=1
for p in "${existing[@]}"; do for p in "${existing[@]}"; do
echo " [$i] $p" echo " [$i] $p"