#!/usr/bin/env python3
"""
Crawl all products from dealplustech.co.th and create markdown files
"""
import requests
from bs4 import BeautifulSoup
import json
import os
import time
from urllib.parse import urljoin, unquote
import re

# Root of the site being crawled; links found on the listing page are
# resolved against it.
BASE_URL = "https://www.dealplustech.co.th"
# Directory that receives one generated "<id>.md" file per crawled product.
OUTPUT_DIR = "/Users/kunthawatgreethong/Gitea/dealplustech/src/content/products"
# Directory that receives downloaded product images. The public path prefix
# "/images/2021/03/" written into the product data is hard-coded separately in
# extract_product_data, so the two must be changed together.
IMAGE_DIR = "/Users/kunthawatgreethong/Gitea/dealplustech/public/images/2021/03"

def get_soup(url):
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree for the page, or ``None`` when the request
        fails (connection error, timeout, invalid URL or non-2xx status).
        Failures are printed, never raised, so a crawl can continue past a
        single bad page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Hand the parser raw bytes so it can detect the encoding from the
    # document itself (e.g. <meta charset>). ``response.text`` falls back to
    # ISO-8859-1 when the Content-Type header carries no charset, which would
    # garble Thai text.
    return BeautifulSoup(response.content, 'html.parser')

def download_image(url, filename):
    """Download the image at *url* into ``IMAGE_DIR`` as *filename*.

    Args:
        url: Absolute URL of the image.
        filename: File name (no directory part) to store the image under.

    Returns:
        *filename* when the file already exists or was downloaded
        successfully, ``None`` when the download or the write failed. Failures
        are printed, never raised.
    """
    filepath = os.path.join(IMAGE_DIR, filename)
    if os.path.exists(filepath):
        return filename

    # Stream into a sibling temp file and rename only once the download is
    # complete. Otherwise an interrupted transfer would leave a truncated
    # image at the final path and the exists-check above would treat it as
    # done on every later run.
    partial_path = f"{filepath}.part"
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, headers=headers, timeout=30, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        return filename
    except (requests.RequestException, OSError) as e:
        print(f"  Failed to download {filename}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return None

def create_slug(title):
    """Turn a (possibly Thai) product title into a lowercase, dash-separated slug.

    Characters other than word characters, the Thai block (U+0E00-U+0E7F),
    whitespace and ``-`` are dropped; every run of whitespace then becomes a
    single ``-``. Existing dashes are kept as they are, so ``"a - b"`` yields
    ``"a---b"`` and leading/trailing whitespace yields leading/trailing dashes.
    """
    disallowed = re.compile(r'[^\w\u0E00-\u0E7F\s-]')
    whitespace_run = re.compile(r'\s+')

    kept = disallowed.sub('', title)
    dashed = whitespace_run.sub('-', kept)
    return dashed.lower()

def _extract_description(soup):
    """Join the text of the first three top-level paragraphs of the entry content."""
    content_div = soup.find('div', class_='entry-content')
    if not content_div:
        return ''
    paragraphs = content_div.find_all('p', recursive=False)
    return ' '.join(p.get_text(strip=True) for p in paragraphs[:3])


def _extract_specifications(soup):
    """Collect label/value pairs from the two-cell rows of the first two tables."""
    specifications = []
    for table in soup.select('table')[:2]:
        for row in table.select('tr'):
            cells = row.select('td, th')
            if len(cells) != 2:
                continue
            label = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            # Overlong "labels" are treated as prose rows, not specifications.
            if label and value and len(label) < 50:
                specifications.append({'label': label, 'value': value})
    return specifications


def _download_main_image(soup, page_url):
    """Download the first wp-content image of the page; return its site path or None."""
    from urllib.parse import urlsplit

    img = soup.select_one('img[src*="wp-content"]')
    if img is None:
        return None

    # The src may be relative or protocol-relative; make it absolute.
    img_url = urljoin(page_url, img.get('src'))

    # Build the file name from the URL path only (no ?query / #fragment) and
    # percent-decode it so the name on disk matches what a browser requests.
    # basename() runs after decoding so an encoded "/" cannot escape IMAGE_DIR.
    filename = os.path.basename(unquote(urlsplit(img_url).path))
    # Drop a trailing "-<W>x<H>" size suffix, anchored right before the
    # extension so dimensions elsewhere in the name are kept.
    # NOTE(review): the file is still fetched from the resized URL, so the
    # saved image keeps the thumbnail's resolution -- confirm that is intended.
    filename = re.sub(r'-\d+x\d+(?=\.[^.]+$)', '', filename)
    if not filename:
        return None

    downloaded = download_image(img_url, filename)
    return f"/images/2021/03/{downloaded}" if downloaded else None


def extract_product_data(product_url):
    """Crawl one product page and return its data as a dict.

    Args:
        product_url: Absolute URL of the product page.

    Returns:
        A dict with the keys ``id``, ``name``, ``nameEn``, ``slug``,
        ``description``, ``shortDescription``, ``image``, ``keywords``,
        ``specifications``, ``features`` and ``applications``; or ``None``
        when the page cannot be fetched, has no ``<h1>`` title of at least
        three characters, or the title yields an empty slug.

    Side effects:
        Prints progress and downloads the main image via ``download_image``.
    """
    print(f"\n📦 Crawling: {product_url[:80]}...")

    soup = get_soup(product_url)
    if soup is None:
        return None

    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else ''
    if len(title) < 3:
        return None

    # The slug doubles as id and file name, so an empty one is unusable.
    slug = create_slug(title)
    if not slug:
        print("  Skipped: title has no slug-safe characters")
        return None

    description = _extract_description(soup)

    # Titles of the form "<Thai name> | <English name>" carry the English
    # name in their last segment.
    name_en = title.split(' | ')[-1].strip() if ' | ' in title else ''

    main_image = _download_main_image(soup, product_url)

    product_data = {
        'id': slug,
        'name': title,
        'nameEn': name_en,
        'slug': slug,
        'description': description[:500] if description else title,
        'shortDescription': description[:200] if description else title,
        # Fall back to a stock image when the page has none or it failed.
        'image': main_image or '/images/2021/03/hdpe-page-full.png',
        # First whitespace-separated word of the title, if there is one.
        'keywords': title.split()[:1],
        'specifications': _extract_specifications(soup),
        'features': [],
        'applications': [],
    }

    print(f"  ✓ {title[:60]}")
    if main_image:
        print(f"  Image: {main_image}")
    if product_data['specifications']:
        print(f"  Specs: {len(product_data['specifications'])} found")

    return product_data

def create_markdown(product):
    """Render a product dict as a Markdown document with YAML front matter.

    Args:
        product: Mapping with the keys ``id``, ``name``, ``nameEn``, ``slug``,
            ``description``, ``shortDescription``, ``image``, ``keywords``
            (list of str) and ``specifications`` (list of dicts with
            ``label`` and ``value``).

    Returns:
        The file content as a string: front matter, an ``# <name>`` heading,
        the description and, when specifications exist, a two-column table.
    """
    def yaml_str(value):
        # A JSON string is a valid YAML double-quoted scalar, so this keeps
        # values containing ": ", " #", quotes or a leading "-"/"[" from
        # corrupting the front matter, and keeps "" a string instead of null.
        return json.dumps('' if value is None else str(value), ensure_ascii=False)

    def table_cell(value):
        # Table rows are single-line and pipe-delimited: flatten whitespace
        # and escape literal pipes.
        return ' '.join(str(value).split()).replace('|', '\\|')

    keywords = product['keywords']
    specifications = product['specifications']

    lines = [
        '---',
        f"id: {yaml_str(product['id'])}",
        f"name: {yaml_str(product['name'])}",
        f"nameEn: {yaml_str(product['nameEn'])}",
        f"slug: {yaml_str(product['slug'])}",
        f"description: {yaml_str(product['description'])}",
        f"shortDescription: {yaml_str(product['shortDescription'])}",
        f"image: {yaml_str(product['image'])}",
    ]

    if keywords:
        lines.append('keywords:')
        lines.extend(f"  - {yaml_str(keyword)}" for keyword in keywords)
    else:
        # A bare "keywords:" would parse as null rather than an empty list.
        lines.append('keywords: []')

    if specifications:
        lines.append('specifications:')
        for spec in specifications:
            lines.append(f"  - label: {yaml_str(spec['label'])}")
            lines.append(f"    value: {yaml_str(spec['value'])}")

    lines += ['---', '', f"# {product['name']}", '', product['description'], '']

    if specifications:
        lines += [
            '## ข้อมูลจำเพาะ',
            '',
            '| รายการ | รายละเอียด |',
            '|--------|------------|',
        ]
        lines.extend(
            f"| {table_cell(spec['label'])} | {table_cell(spec['value'])} |"
            for spec in specifications
        )
        lines.append('')

    return '\n'.join(lines) + '\n'

def get_all_product_urls():
    """Collect candidate product URLs from the site's ``/product/`` listing page.

    A link is kept when it has more than three characters of visible text, its
    (percent-decoded) href contains one of the product keywords, it points to
    the crawled site itself, and it is not a ``/wp-`` asset, a fragment link
    or the listing page.

    Returns:
        A sorted list of absolute URLs; empty when the listing page cannot be
        fetched.
    """
    from urllib.parse import urlsplit

    def bare_host(url):
        # Compare hosts case-insensitively and ignoring a leading "www.".
        host = urlsplit(url).netloc.lower()
        return host[4:] if host.startswith('www.') else host

    print("=== Getting all product URLs ===\n")

    listing_url = f"{BASE_URL}/product/"
    soup = get_soup(listing_url)
    if soup is None:
        return []

    keywords = ('product', 'pipe', 'valve', 'pump', 'system', 'ท่อ', 'เครื่อง', 'อุปกรณ์')
    site_host = bare_host(BASE_URL)
    product_urls = set()

    for link in soup.select('a[href]'):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if not href or len(text) <= 3:
            continue

        # Thai path segments are usually percent-encoded in hrefs; decode
        # before matching or the Thai keywords can never hit.
        decoded_href = unquote(href).lower()
        if not any(keyword in decoded_href for keyword in keywords):
            continue

        full_url = urljoin(BASE_URL, href)
        if '/wp-' in full_url or '#' in full_url:
            continue
        # Drop off-site and non-HTTP links (social media, mailto:, tel:).
        if bare_host(full_url) != site_host:
            continue
        # The listing page matches "product" but is not a product itself.
        if full_url.rstrip('/') == listing_url.rstrip('/'):
            continue

        product_urls.add(full_url)

    print(f"Found {len(product_urls)} product URLs\n")
    return sorted(product_urls)

def main():
    """Crawl every product page and write one Markdown file per new product.

    Steps: create the output and image directories, collect the product URLs,
    crawl each URL (pausing 0.5 s between requests), write ``<id>.md`` for
    each product that has no file yet, then print a summary. Existing files
    are never overwritten.
    """
    banner = "=" * 60

    print(banner)
    print("CRAWLING DEAL PLUS TECH - ALL PRODUCTS")
    print(banner)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(IMAGE_DIR, exist_ok=True)

    product_urls = get_all_product_urls()
    if not product_urls:
        print("No products found! Exiting.")
        return

    # Crawl each product
    products_data = []
    total = len(product_urls)
    for i, url in enumerate(product_urls, 1):
        print(f"\n[{i}/{total}]")
        data = extract_product_data(url)
        if data:
            products_data.append(data)

        # Be polite to the server.
        time.sleep(0.5)

    print(f"\n{banner}")
    print("CREATING MARKDOWN FILES")
    print(f"{banner}\n")

    created = 0
    for product in products_data:
        filename = f"{product['id']}.md"
        filepath = os.path.join(OUTPUT_DIR, filename)

        # Never overwrite a file that is already there (it may have been
        # edited by hand or written earlier in this run for the same slug).
        if os.path.exists(filepath):
            print(f"✓ Skip (exists): {filename}")
            continue

        md_content = create_markdown(product)

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(md_content)
        except (OSError, UnicodeError) as e:
            print(f"✗ Failed: {filename} - {e}")
        else:
            print(f"✓ Created: {filename}")
            created += 1

    # Count only Markdown files so stray entries such as .DS_Store do not
    # inflate the product total.
    md_count = sum(1 for name in os.listdir(OUTPUT_DIR) if name.endswith('.md'))

    print(f"\n{banner}")
    print("✅ CRAWLING COMPLETE!")
    print(banner)
    print(f"📦 Total products crawled: {len(products_data)}")
    print(f"📝 New markdown files created: {created}")
    print(f"📁 Total products in folder: {md_count}")
    print(f"\nSaved to: {OUTPUT_DIR}")
    print(f"Images saved to: {IMAGE_DIR}")

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()