Duplikate die nicht als Duplikate erkannt werden

anon58924890 · 18. Januar 2024 um 15:52

Wenn ich alle paar Monate meine Kontoauszüge, Telefonrechnungen und Ähnliches einlese, kommt es wohl vor, dass ich Dokumente doppelt einlesen lasse, ohne es zunächst zu merken.

Paperless hat deswegen schon mehrfach identische Dokumente mit demselben Dateinamen eingelesen und verarbeitet, ohne einen Fehler bzgl. Duplikaten auszugeben.

Beim Verarbeiten von E-Mails mit Rechnungen ist das wohl auch schon vorgekommen.

Kennt ihr das Problem auch?
Ich habe die Dokumente bisher mühevoll verglichen und dann wieder gelöscht.
Ziel wäre aber, dass dies gar nicht erst vorkommt.

Danke

Gerdie42 · 4. Juli 2024 um 18:21

Hi,

ich bin neu hier.

Ich habe mich mal an ChatGPT versucht und ein Programm erstellt, welches ein Verzeichnis mit den Unterordnern einliest, alle PDF-Dateien vergleicht und mir
eine Liste mit Dateien ausgibt, welche Duplikate sind.

Hier ist der Code:
import os
import PyPDF2
import pandas as pd
from tkinter import filedialog, messagebox, Tk, Button, Label, ttk, DoubleVar
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

# Function to extract text from a PDF file

def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at ``file_path``.

    Extraction is best-effort: a page whose text extraction raises is
    reported on stdout and skipped, and a file that cannot be read at all
    (such as a damaged or encrypted PDF) is reported and yields an empty
    string instead of aborting the caller.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every readable page joined together, or an empty string
        when nothing could be extracted.
    """
    page_texts = []
    with open(file_path, 'rb') as file:
        try:
            reader = PyPDF2.PdfReader(file)
            pages = list(reader.pages)
        except Exception as e:
            # One unreadable file must not stop the processing of the others.
            print(f"Fehler beim Lesen von {file_path}: {e}")
            return ""

        for page_num, page in enumerate(pages, start=1):
            try:
                # Guard against a falsy result (e.g. None) for pages
                # without any extractable text.
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                print(f"Fehler beim Extrahieren von Text auf Seite {page_num} von {file_path}: {e}")
    return "".join(page_texts)

# Function to count all PDF files in a directory and its subdirectories

def count_pdfs_in_directory(directory):
    """Count the files ending in ``.pdf`` below ``directory``.

    The directory tree is walked recursively. The suffix check is
    case-sensitive, so a file named ``X.PDF`` is not counted.

    Args:
        directory: Root directory of the search.

    Returns:
        Number of matching files as an ``int`` (0 when there are none).
    """
    count = 0
    # The directory path and sub-directory list of each walk step are unused.
    for _, _, files in os.walk(directory):
        count += sum(1 for filename in files if filename.endswith(".pdf"))
    return count

# Function to load all PDF files from a directory and its subdirectories

def load_pdfs_from_directory(directory, progress_var, total_files):
    """Extract the text of every ``.pdf`` file below ``directory``.

    After each processed file the progress is written to ``progress_var``
    as a percentage and the module-level ``progress_bar`` widget is
    refreshed.

    Args:
        directory: Root directory that is walked recursively.
        progress_var: Variable object with a ``set`` method that receives
            the progress in percent (0-100).
        total_files: Number of PDF files expected; used as the 100 % mark.

    Returns:
        Dict mapping each file's path relative to ``directory`` to its
        extracted text. Files without extractable text are left out.
    """
    pdf_texts = {}
    processed_files = 0
    for current_dir, _, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(current_dir, filename)
            text = extract_text_from_pdf(file_path)
            if text:  # Only keep files whose text was extracted successfully
                relative_path = os.path.relpath(file_path, directory)
                pdf_texts[relative_path] = text
            # Every PDF counts as processed, with or without text, so the
            # progress bar reaches 100 % at the end.
            processed_files += 1
            if total_files > 0:
                progress_var.set((processed_files / total_files) * 100)
            progress_bar.update()
    return pdf_texts

# Function to compare the PDF files

def compare_pdfs(pdf_texts):
    """Compute the pairwise cosine similarity of the given document texts.

    Each text is converted to a TF-IDF vector and every pair of vectors is
    compared.

    Args:
        pdf_texts: Dict mapping a file name to the text of that file.

    Returns:
        A square ``pandas.DataFrame`` whose index and columns are the file
        names and whose cells hold the similarity of the two files. For an
        empty ``pdf_texts`` an empty DataFrame is returned.
    """
    filenames = list(pdf_texts)
    if not filenames:
        # An empty corpus cannot be vectorized; return an empty table
        # instead of letting the vectorizer raise.
        return pd.DataFrame(index=filenames, columns=filenames, dtype=float)

    texts = list(pdf_texts.values())
    # The sparse TF-IDF matrix is passed on directly; converting it to a
    # dense array first only costs memory.
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return pd.DataFrame(cosine_matrix, index=filenames, columns=filenames)

# Function to handle the directory selection and PDF comparison

# Cosine similarity above which two PDFs are treated as duplicates.
SIMILARITY_THRESHOLD = 0.9999


def _filter_duplicates(similarity_df):
    """Keep only above-threshold similarities between *different* files.

    Args:
        similarity_df: Square DataFrame of pairwise similarities whose index
            and columns carry the same file names.

    Returns:
        A DataFrame with all other cells set to NaN and with every row and
        column that has no remaining value removed.
    """
    others = similarity_df.copy()
    for name in others.index:
        # A file always matches itself; blank the diagonal so that it
        # cannot keep a row or column alive in the filtered table.
        others.at[name, name] = float("nan")
    filtered = others.where(others > SIMILARITY_THRESHOLD)
    return filtered.dropna(how='all', axis=0).dropna(how='all', axis=1)


def _format_workbook(excel_path):
    """Highlight duplicate cells and fit the column widths of the report."""
    workbook = load_workbook(excel_path)
    sheet = workbook.active

    # Yellow background for cells with a value above the threshold
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")

    # Row 1 and column 1 hold the file names, so the values start at 2/2.
    for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=2, max_col=sheet.max_column):
        for cell in row:
            if cell.value is not None and cell.value > SIMILARITY_THRESHOLD:
                cell.fill = fill

    # Adjust column widths to fit the content
    for column in sheet.columns:
        column = list(column)
        max_length = max((len(str(cell.value)) for cell in column), default=0)
        sheet.column_dimensions[column[0].column_letter].width = max_length + 2

    workbook.save(excel_path)


def select_directory():
    """Ask for a directory, compare its PDFs and save the duplicate report.

    Button callback. Updates the module-level widgets ``files_label``,
    ``filtered_files_label``, ``progress_var`` and ``progress_bar`` and
    writes two files into the chosen directory:

    * ``pdf_similarity_overview_filtered.xlsx`` - similarity table of the
      files that have at least one duplicate,
    * ``files_with_high_similarity.txt`` - one duplicate file name per line.

    Nothing happens when the dialog is cancelled. A warning box is shown
    when the directory contains no PDF files or when no text could be
    extracted from any of them.
    """
    directory = filedialog.askdirectory()
    if not directory:
        return

    total_files = count_pdfs_in_directory(directory)

    # Update the label to show the number of files
    files_label.config(text=f"Anzahl der zu vergleichenden Dateien: {total_files}")

    progress_var.set(0)
    progress_bar.update()

    if total_files == 0:
        messagebox.showwarning("Warnung", "Keine PDF-Dateien im Verzeichnis gefunden.")
        return

    pdf_texts = load_pdfs_from_directory(directory, progress_var, total_files)
    if not pdf_texts:
        # E.g. only scanned PDFs: there is nothing that could be compared.
        messagebox.showwarning("Warnung", "Aus keiner PDF-Datei konnte Text extrahiert werden.")
        return

    similarity_df = compare_pdfs(pdf_texts)
    filtered_df = _filter_duplicates(similarity_df)

    # Every remaining row matches at least one *other* file. Together with
    # the match of a file with itself that is the "at least two
    # similarities" the label text refers to.
    files_with_high_similarity = filtered_df.index.tolist()
    num_filtered_files = len(files_with_high_similarity)

    # Update the label to show the number of filtered files
    filtered_files_label.config(text=f"Anzahl der Dateien mit mindestens zwei Ähnlichkeiten > 0.9999: {num_filtered_files}")

    # Save the filtered similarity overview as an Excel file
    excel_path = os.path.join(directory, "pdf_similarity_overview_filtered.xlsx")
    filtered_df.to_excel(excel_path, index=True)

    # Save the list of files with high similarity to a TXT file; UTF-8 so
    # that any file name can be written regardless of the platform default.
    txt_file_path = os.path.join(directory, "files_with_high_similarity.txt")
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.writelines(f"{file}\n" for file in files_with_high_similarity)

    if not filtered_df.empty:
        _format_workbook(excel_path)

    messagebox.showinfo("Fertig", "Der Vergleich ist abgeschlossen und die Ergebnisse wurden gespeichert.")

# Create the main window

root = Tk()
root.title("PDF Vergleich")
root.geometry("500x250")

# Create and place the label for file count
files_label = Label(root, text="Anzahl der zu vergleichenden Dateien: 0")
files_label.pack(pady=10)

# Create and place the label for filtered file count
filtered_files_label = Label(root, text="Anzahl der Dateien mit mindestens zwei Ähnlichkeiten > 0.9999: 0")
filtered_files_label.pack(pady=10)

# Create and place the progress bar; progress_var carries the percentage
# (0-100) that the bar displays.
progress_var = DoubleVar()
progress_bar = ttk.Progressbar(root, orient="horizontal", length=400, mode="determinate", variable=progress_var)
progress_bar.pack(pady=10)

# Create and place the button that starts the comparison
button = Button(root, text="Verzeichnis auswählen und PDFs vergleichen", command=select_directory)
button.pack(pady=20)

# Run the application (blocks until the window is closed)
root.mainloop()

Grüße Gerdie