Hi,
ich bin neu hier.
Ich habe mich mal mit ChatGPT versucht und ein Programm erstellt, welches ein Verzeichnis mit den Unterordnern einliest, alle PDF-Dateien vergleicht und mir
eine Liste mit Dateien ausgibt, welche Duplikate sind.
Hier ist der Code:
import os
import PyPDF2
import pandas as pd
from tkinter import filedialog, messagebox, Tk, Button, Label, ttk, DoubleVar
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at *file_path*.

    Extraction is best-effort so that one bad file cannot abort a whole
    directory scan:

    * a page whose extraction raises is skipped and reported on stdout;
    * a file that cannot be opened or parsed at all is reported on stdout
      and yields whatever text was collected so far (usually ``""``).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all successfully extracted pages joined together, or an
        empty string when nothing could be extracted.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages, start=1):
                try:
                    # "or ''" guards against a page yielding None instead of str.
                    page_texts.append(page.extract_text() or "")
                except Exception as e:
                    print(f"Fehler beim Extrahieren von Text auf Seite {page_num} von {file_path}: {e}")
    except Exception as e:
        # Unreadable, corrupt or encrypted file: report it and keep going.
        print(f"Fehler beim Lesen von {file_path}: {e}")
    return "".join(page_texts)
# Function to count all PDF files in a directory and its subdirectories
def count_pdfs_in_directory(directory):
    """Count the PDF files in *directory* and all of its subdirectories.

    The ``.pdf`` extension is matched case-insensitively, so ``SCAN.PDF`` is
    counted just like ``scan.pdf``.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        The number of files whose name ends in ``.pdf`` (any letter case).
    """
    return sum(
        1
        for _, _, files in os.walk(directory)
        for filename in files
        if filename.lower().endswith(".pdf")
    )
# Function to load all PDF files from a directory and its subdirectories
def load_pdfs_from_directory(directory, progress_var, total_files):
    """Extract the text of every PDF below *directory*, updating the progress bar.

    Files are matched by a case-insensitive ``.pdf`` extension. Every matching
    file advances the progress counter, whether or not text could be
    extracted from it; only files that yielded non-empty text are returned.

    Besides *progress_var*, this function refreshes the module-level
    ``progress_bar`` widget after each file so the GUI repaints during the
    scan.

    Args:
        directory: Root directory that is walked recursively.
        progress_var: Tk variable receiving the progress as a percentage.
        total_files: Expected number of PDF files, used as the 100 % mark.

    Returns:
        A dict mapping each PDF's path relative to *directory* to its text.
    """
    pdf_texts = {}
    processed_files = 0
    for current_dir, _, files in os.walk(directory):
        for filename in files:
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(current_dir, filename)
            text = extract_text_from_pdf(file_path)
            if text:  # only keep files from which text was actually extracted
                pdf_texts[os.path.relpath(file_path, directory)] = text
            processed_files += 1
            if total_files > 0:
                # Clamp so a mismatching total can never push the bar past 100 %.
                progress_var.set(min(processed_files / total_files, 1.0) * 100)
            progress_bar.update()
    return pdf_texts
# Function to compare the PDF files
def compare_pdfs(pdf_texts):
    """Compute the pairwise TF-IDF cosine similarity of the given texts.

    Args:
        pdf_texts: Dict mapping a file name to the text extracted from it.

    Returns:
        A square DataFrame indexed and labelled by file name whose cell
        ``[a, b]`` is the cosine similarity between the texts of ``a`` and
        ``b``. An empty DataFrame is returned for an empty input, because
        the vectorizer cannot be fitted on zero documents.
    """
    filenames = list(pdf_texts)
    if not filenames:
        return pd.DataFrame()
    texts = list(pdf_texts.values())
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    # cosine_similarity accepts the sparse matrix directly; converting it to
    # a dense array first only costs memory.
    cosine_matrix = cosine_similarity(tfidf_matrix)
    return pd.DataFrame(cosine_matrix, index=filenames, columns=filenames)
# Function to handle the directory selection and PDF comparison
# Similarity above which two PDF texts are treated as duplicates.
_SIMILARITY_THRESHOLD = 0.9999


def _filter_duplicates(similarity_df, threshold=_SIMILARITY_THRESHOLD):
    """Keep only the cross-file similarities greater than *threshold*.

    Self-comparisons (the diagonal) are masked out, then rows and columns
    without any remaining value are dropped, so the result contains only
    files that have at least one duplicate.
    """
    filtered_df = similarity_df.where(similarity_df > threshold)
    for i in range(min(filtered_df.shape)):
        # A file always matches itself; that must not count as a duplicate.
        filtered_df.iloc[i, i] = float("nan")
    return filtered_df.dropna(how="all", axis=0).dropna(how="all", axis=1)


def _write_file_list(txt_file_path, filenames):
    """Write one file name per line to *txt_file_path* (UTF-8)."""
    # Explicit UTF-8 so file names with non-ASCII characters never fail on
    # platforms whose default encoding cannot represent them.
    with open(txt_file_path, "w", encoding="utf-8") as txt_file:
        txt_file.writelines(f"{name}\n" for name in filenames)


def _format_excel(excel_path, threshold=_SIMILARITY_THRESHOLD):
    """Highlight cells above *threshold* and fit the column widths in place."""
    workbook = load_workbook(excel_path)
    sheet = workbook.active
    fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")

    # Row 1 and column 1 hold the file names, so values start at B2.
    for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=2, max_col=sheet.max_column):
        for cell in row:
            if isinstance(cell.value, (int, float)) and cell.value > threshold:
                cell.fill = fill

    for column in sheet.columns:
        cells = list(column)
        if not cells:
            continue
        max_length = max(len(str(cell.value)) for cell in cells)
        sheet.column_dimensions[cells[0].column_letter].width = max_length + 2

    workbook.save(excel_path)


def select_directory():
    """Ask for a directory, compare all PDFs below it and save the results.

    Button callback of the GUI. It reads and updates the module-level
    widgets ``files_label``, ``filtered_files_label``, ``progress_var`` and
    ``progress_bar``. Two files are written into the chosen directory:

    * ``pdf_similarity_overview_filtered.xlsx`` - similarity matrix reduced
      to the files that have at least one duplicate, with highlighted cells;
    * ``files_with_high_similarity.txt`` - the names of those files.

    Nothing happens when the dialog is cancelled. A warning is shown when
    the directory contains no PDF files or no text could be extracted.
    """
    directory = filedialog.askdirectory()
    if not directory:
        return

    total_files = count_pdfs_in_directory(directory)
    files_label.config(text=f"Anzahl der zu vergleichenden Dateien: {total_files}")
    progress_var.set(0)
    progress_bar.update()

    if total_files <= 0:
        messagebox.showwarning("Warnung", "Keine PDF-Dateien im Verzeichnis gefunden.")
        return

    pdf_texts = load_pdfs_from_directory(directory, progress_var, total_files)
    if not pdf_texts:
        # Without any text the vectorizer cannot be fitted.
        messagebox.showwarning("Warnung", "Aus den PDF-Dateien konnte kein Text extrahiert werden.")
        return

    similarity_df = compare_pdfs(pdf_texts)
    filtered_df = _filter_duplicates(similarity_df)

    # Every remaining row has at least one match besides the file itself,
    # i.e. at least two similarities above the threshold counting itself.
    files_with_high_similarity = filtered_df.index.tolist()
    num_filtered_files = len(files_with_high_similarity)
    filtered_files_label.config(text=f"Anzahl der Dateien mit mindestens zwei Ähnlichkeiten > 0.9999: {num_filtered_files}")

    excel_path = os.path.join(directory, "pdf_similarity_overview_filtered.xlsx")
    filtered_df.to_excel(excel_path, index=True)

    txt_file_path = os.path.join(directory, "files_with_high_similarity.txt")
    _write_file_list(txt_file_path, files_with_high_similarity)

    _format_excel(excel_path)

    messagebox.showinfo("Fertig", "Der Vergleich ist abgeschlossen und die Ergebnisse wurden gespeichert.")
# Create the main window
root = Tk()
root.title("PDF Vergleich")
root.geometry("500x250")

# Create and place the label for file count
files_label = Label(root, text="Anzahl der zu vergleichenden Dateien: 0")
files_label.pack(pady=10)

# Create and place the label for filtered file count
filtered_files_label = Label(root, text="Anzahl der Dateien mit mindestens zwei Ähnlichkeiten > 0.9999: 0")
filtered_files_label.pack(pady=10)

# Create and place the progress bar; progress_var holds the percentage shown
progress_var = DoubleVar()
progress_bar = ttk.Progressbar(root, orient="horizontal", length=400, mode="determinate", variable=progress_var)
progress_bar.pack(pady=10)

# Create and place the button that starts the comparison
button = Button(root, text="Verzeichnis auswählen und PDFs vergleichen", command=select_directory)
button.pack(pady=20)

# Run the application (blocks until the window is closed)
root.mainloop()
Grüße Gerdie