Find text from image OCR - Python

Find text in an image using Tesseract OCR.

import os
import re
from PIL import Image
import pytesseract
import cv2
import numpy as np

# ✅ Path to Tesseract (Windows). Only applied when the executable really
# exists there; otherwise pytesseract falls back to finding `tesseract` on PATH.
_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# ✅ Folder containing GoPro images
folder = "./images"

# ✅ Accepted image extensions (matched case-insensitively)
valid_exts = (".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG")

# Names this script itself writes into `folder`; they must never be treated as
# input, otherwise every re-run would OCR (and overwrite) its own output.
_GENERATED_NAME_RE = re.compile(r"(?:clean|processed)_\d+\.jpg")

# Dot-containing words (like URLs, filenames, etc.); compiled once, used per file.
_DOTTED_TOKEN_RE = re.compile(r'\b\w+\.\w{2,}\b')


def list_source_images(directory, extensions=valid_exts):
    """Return the sorted image file names in *directory* that should be OCR'd.

    A name is kept when it ends with one of *extensions* (compared
    case-insensitively, so ``.Jpg`` is accepted too) and is not one of the
    ``clean_N.jpg`` / ``processed_N.jpg`` files this script generates.
    Sorting makes the 1-based numbering of the output files reproducible,
    because ``os.listdir`` returns names in arbitrary order.

    Raises:
        OSError: if *directory* cannot be listed.
    """
    lowered = tuple(ext.lower() for ext in extensions)
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith(lowered)
        and not _GENERATED_NAME_RE.fullmatch(name)
    )


def normalize_skew_angle(angle):
    """Convert a ``cv2.minAreaRect`` angle into the rotation that undoes it.

    Older OpenCV releases report the angle in [-90, 0); newer ones (4.5+)
    report it in (0, 90]. Both describe the rectangle modulo 90 degrees, so the
    value is folded into [-45, 45] and negated. For inputs in the old range
    the result is identical to the original two-branch formula.
    """
    if angle < -45:
        return -(90 + angle)
    if angle > 45:
        # Newer OpenCV convention: without this branch an axis-aligned page
        # (angle == 90) would be rotated by a full -90 degrees.
        return 90 - angle
    return -angle


def find_dotted_tokens(text):
    """Return every ``word.word`` token in *text* (at least 2 chars after the dot)."""
    return _DOTTED_TOKEN_RE.findall(text)


def deskew(binary):
    """Rotate the thresholded image *binary* by its estimated skew angle.

    The angle comes from the minimum-area rectangle around all non-zero
    pixels. An all-black image has no such pixels and is returned unchanged.
    """
    # NOTE(review): after THRESH_BINARY dark text on a light page is 0, so the
    # non-zero pixels are presumably the background rather than the text;
    # confirm whether the threshold should be inverted for reliable deskewing.
    coords = np.column_stack(np.where(binary > 0))
    if len(coords) == 0:
        return binary

    angle = normalize_skew_angle(cv2.minAreaRect(coords)[-1])
    (h, w) = binary.shape[:2]
    center = (w // 2, h // 2)
    matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    return cv2.warpAffine(
        binary,
        matrix,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )


def preprocess(img, scale=2.0):
    """Turn a BGR image into an upscaled, binarized, deskewed image for OCR."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Otsu picks the threshold itself; the 0 passed as threshold is ignored.
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    gray = cv2.medianBlur(gray, 3)
    gray = deskew(gray)
    # Upscaling to improve OCR.
    return cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)


def main():
    """OCR every image in `folder` and print the dot-containing words found."""
    if os.path.isfile(_TESSERACT_CMD):
        pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD

    if not os.path.isdir(folder):
        raise SystemExit(f"❌ Image folder not found: {folder}")

    files = list_source_images(folder)

    # --- Loop over each file ---
    for i, file in enumerate(files, start=1):
        path = os.path.join(folder, file)
        print(f"\n📸 Processing file {i}/{len(files)}: {file}")

        # --- STEP 1: Re-encode with Pillow as a plain RGB JPEG ---
        # (the original intent was to drop nonstandard metadata before
        # handing the file to OpenCV)
        clean_path = os.path.join(folder, f"clean_{i}.jpg")
        try:
            with Image.open(path) as img:
                img.convert("RGB").save(clean_path, "JPEG", quality=100)
        except Exception as e:
            # Per-file boundary: report the failure and move on to the next image.
            print(f"❌ Could not open {file}: {e}")
            continue

        # --- STEP 2: Load with OpenCV ---
        img = cv2.imread(clean_path)
        if img is None:
            print(f"⚠️ Skipping {file} (could not read with OpenCV)")
            continue

        # --- STEPS 3-5: Threshold, deskew and upscale ---
        resized = preprocess(img)

        # --- STEP 6: Save preprocessed image (optional for debugging) ---
        processed_path = os.path.join(folder, f"processed_{i}.jpg")
        cv2.imwrite(processed_path, resized)

        # --- STEP 7: OCR with tuned config ---
        config = "--psm 6"
        text = pytesseract.image_to_string(resized, config=config, lang="eng")

        # --- STEP 8: Extract dot-containing words (like URLs, filenames, etc.) ---
        matches = find_dotted_tokens(text)

        if matches:
            print("✅ Matches found:")
            for match in matches:
                print("   ", match)
        else:
            print("⚠️ No matches found.")

    print("\n✅ Processing complete.")


if __name__ == "__main__":
    main()

links

social