Name change and multiprocessing of the fuzzy-matching operation
This commit is contained in:
parent
9e6961c406
commit
56d41fb487
@ -9,7 +9,8 @@
|
||||
# the directory is removed outright.
|
||||
# 3. The remaining directories are “cleaned” (converted to lowercase, punctuation removed)
|
||||
# and then grouped by fuzzy similarity using a configurable threshold.
|
||||
# 4. Within each group, if at least one directory contains "2160p" and one contains "1080p",
|
||||
# The fuzzy similarity process is optimized with a multiprocessing helper.
|
||||
# 4. Within each group, if one directory’s name contains "2160p" and another contains "1080p",
|
||||
# the 1080p directory(ies) are removed (or flagged in dry-run mode).
|
||||
# 5. For any remaining duplicate groups, the user is prompted to select a directory to remove.
|
||||
# 6. A --dry-run mode is available to preview removals without actually deleting any directories.
|
||||
@ -97,15 +98,27 @@ done
|
||||
# Function: Normalize and clean a directory name.
# Arguments: $1 - raw directory name (typically a basename).
# Outputs:   the cleaned name on stdout: lowercased, every character other
#            than a-z, 0-9 and space removed, leading/trailing whitespace
#            trimmed and runs of whitespace collapsed to a single space.
clean_name() {
  local name="$1"
  # printf instead of echo: echo would treat names such as "-n" or "-e" as
  # options and print nothing, so those directories would clean to "".
  # xargs (default echo) is only used to trim/collapse whitespace; it is safe
  # here because sed has already removed quotes, backslashes and dashes.
  printf '%s\n' "$name" \
    | tr '[:upper:]' '[:lower:]' \
    | sed 's/[^a-z0-9 ]//g' \
    | xargs
}
|
||||
|
||||
# Function: Compute fuzzy similarity between two names using Python's difflib.
|
||||
compute_similarity() {
|
||||
local name1="$1"
|
||||
local name2="$2"
|
||||
python3 -c "import sys; from difflib import SequenceMatcher; print(SequenceMatcher(None, sys.argv[1], sys.argv[2]).ratio())" "$name1" "$name2"
|
||||
# Function: Compute fuzzy similarities between a target and a list of strings using multiprocessing.
# Arguments: $1   - target string
#            $2.. - candidate strings (group representatives) to compare against
# Outputs:   one line on stdout with one difflib SequenceMatcher ratio per
#            candidate, space-separated, in the same order as the candidates
#            (an empty line when no candidates are given).
compute_similarities() {
  local target="$1"
  shift
  # Pass target and the list of representatives as command-line arguments to Python.
  # The heredoc delimiter is quoted so the shell never expands anything inside
  # the Python source.
  python3 - "$target" "$@" <<'EOF'
import multiprocessing
import os
import sys
from difflib import SequenceMatcher

target = sys.argv[1]
reps = sys.argv[2:]


def similarity(rep):
    return SequenceMatcher(None, target, rep).ratio()


def main():
    # This program is read from stdin, so worker processes cannot re-import
    # __main__. Only the "fork" start method lets workers see similarity();
    # with spawn/forkserver the pool would fail to unpickle the function.
    # Request fork explicitly and fall back to a serial loop if unavailable.
    try:
        ctx = multiprocessing.get_context("fork")
    except ValueError:
        ctx = None

    if ctx is not None and len(reps) > 1:
        workers = min(len(reps), os.cpu_count() or 1)
        with ctx.Pool(workers) as pool:
            results = pool.map(similarity, reps)
    else:
        # Zero or one candidate (or no fork support): a pool is pure overhead.
        results = [similarity(rep) for rep in reps]

    print(" ".join(map(str, results)))


if __name__ == "__main__":
    main()
EOF
}
|
||||
|
||||
echo "=== Grouping Remaining Directories by Fuzzy Similarity ==="
|
||||
@ -118,17 +131,18 @@ for d in "${filtered_dirs[@]}"; do
|
||||
base=$(basename "$d")
|
||||
cleaned=$(clean_name "$base")
|
||||
added=false
|
||||
# Compare with each existing group's representative.
|
||||
for i in "${!group_rep[@]}"; do
|
||||
rep="${group_rep[$i]}"
|
||||
sim=$(compute_similarity "$rep" "$cleaned")
|
||||
if (( $(echo "$sim >= $SIMILARITY_THRESHOLD" | bc -l) )); then
|
||||
if [ "${#group_rep[@]}" -gt 0 ]; then
|
||||
# Compute similarities between the cleaned name and all group representatives concurrently.
|
||||
similarities=$(compute_similarities "$cleaned" "${group_rep[@]}")
|
||||
read -r -a sims <<< "$similarities"
|
||||
for i in "${!sims[@]}"; do
|
||||
if (( $(echo "${sims[$i]} >= $SIMILARITY_THRESHOLD" | bc -l) )); then
|
||||
groups["$i"]+=$'\n'"$d"
|
||||
added=true
|
||||
break
|
||||
fi
|
||||
done
|
||||
# If not added to an existing group, create a new group.
|
||||
fi
|
||||
if [ "$added" = false ]; then
|
||||
new_index=${#group_rep[@]}
|
||||
group_rep+=("$cleaned")
|
||||
@ -153,7 +167,7 @@ for key in "${!groups[@]}"; do
|
||||
fi
|
||||
done
|
||||
if $has_2160p && $has_1080p; then
|
||||
echo "Group (representative cleaned name: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
|
||||
echo "Group (representative: ${group_rep[$key]:-unknown}) has both 1080p and 2160p directories."
|
||||
new_group=()
|
||||
for path in "${paths[@]}"; do
|
||||
base=$(basename "$path")
|
||||
@ -185,7 +199,7 @@ for key in "${!groups[@]}"; do
|
||||
fi
|
||||
done
|
||||
if [ "${#existing[@]}" -gt 1 ]; then
|
||||
echo "Duplicate group (representative cleaned name: ${group_rep[$key]:-unknown}):"
|
||||
echo "Duplicate group (representative: ${group_rep[$key]:-unknown}):"
|
||||
i=1
|
||||
for p in "${existing[@]}"; do
|
||||
echo " [$i] $p"
|
Loading…
x
Reference in New Issue
Block a user