feat: Complete product catalog with images
✅ PRODUCTS: - 7 product markdown files with full data - All product images working correctly - Specs, features, applications for each product - SEO keywords included ✅ IMAGES: - 96 images in public folder - 15 new product images downloaded - Correct image paths in all products ✅ BUILD: - 16 pages building successfully - All images load correctly - Pure CSS (no Tailwind dependency) - 8.7KB CSS bundle ✅ UX/UI: - Modern responsive design - Professional visual hierarchy - Mobile-optimized - Fast loading Products included: - ท่อ HDPE - PP-R/PP-RCT POLOPLAST - ท่อ PPR ตราช้าง (SCG) - ท่อ PPR – Thai PPR - ท่อไซเลอร์ (Syler) - ท่อระบายน้ำ 3 ชั้น ไซเลนท์ (XYLENT) - + 34 more products ready to add Ready for production deployment!
This commit is contained in:
140
scripts/crawl_products.py
Normal file
140
scripts/crawl_products.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Crawl Deal Plus Tech original website to extract all products
|
||||
"""
|
||||
import json
import re
import time
from pathlib import Path
from urllib.parse import urljoin, unquote

import requests
from bs4 import BeautifulSoup
||||
|
||||
BASE_URL = "https://www.dealplustech.co.th"
|
||||
|
||||
def get_soup(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    Sends a browser-like User-Agent so the WordPress site serves the
    normal HTML page; raises for any non-2xx HTTP status.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    resp = requests.get(url, headers=request_headers, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')
def extract_products_from_category(category_url):
    """Collect product links from a category page.

    Returns a list of ``{'title': ..., 'url': ...}`` dicts, with URLs
    made absolute against ``BASE_URL``.
    """
    page = get_soup(category_url)
    found = []

    for anchor in page.select('article a[href]'):
        target = anchor.get('href', '')
        if '/wp-content/uploads/' in target:
            # Anchor points at an uploaded image, not a product page.
            continue
        label = anchor.get_text(strip=True)
        if not label or len(label) <= 3:
            # Too short to be a meaningful product title.
            continue
        found.append({
            'title': label,
            'url': urljoin(BASE_URL, target),
        })

    return found
def extract_product_details(product_url):
    """Scrape a single product page.

    Returns a dict with ``title``, ``url``, ``description`` (capped at
    500 characters) and ``images`` (absolute URLs), or ``None`` when
    the page cannot be fetched or parsed.
    """
    try:
        page = get_soup(product_url)

        # Title: first <h1> on the page; empty string when absent.
        heading = page.find('h1')
        name = heading.get_text(strip=True) if heading else ''

        # Description: first five paragraphs of the WordPress entry body.
        summary = ''
        body = page.find('div', class_='entry-content')
        if body:
            summary = '\n'.join(
                p.get_text(strip=True) for p in body.find_all('p')[:5]
            )

        # Images: up to five wp-content-hosted images, made absolute.
        pictures = []
        for tag in page.select('img[src*="wp-content"]')[:5]:
            source = tag.get('src')
            if source:
                pictures.append(urljoin(BASE_URL, source))

        return {
            'title': name,
            'url': product_url,
            'description': summary[:500],  # Limit description length
            'images': pictures,
        }
    except Exception as e:
        # Best-effort crawler: report the failure and move on.
        print(f"Error extracting {product_url}: {e}")
        return None
def main():
    """Crawl the Deal Plus Tech site and dump product data to JSON.

    Discovers candidate product URLs from the main /product/ page's
    navigation and content links, scrapes each one with
    ``extract_product_details``, and writes the results next to this
    script as ``crawled_products.json``.
    """
    print("=== Crawling Deal Plus Tech Website ===\n")

    # Crawl main product page
    print("Crawling main product page...")
    soup = get_soup(f"{BASE_URL}/product/")

    # A set de-duplicates URLs that appear in both nav and content.
    product_links = set()

    # Navigation links: keep only URLs mentioning a product keyword
    # (Thai or English) to skip About/Contact-style pages.
    nav_links = soup.select('nav a[href]')
    for link in nav_links:
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            if any(keyword in href.lower() for keyword in ['ท่อ', 'pipe', 'valve', 'pump', 'system', 'เครื่อง', 'อุปกรณ์']):
                product_links.add(urljoin(BASE_URL, href))

    # Content-area links: assumed product-related, so no keyword filter.
    content_links = soup.select('.entry-content a[href]')
    for link in content_links:
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            product_links.add(urljoin(BASE_URL, href))

    print(f"Found {len(product_links)} potential product links\n")

    # Scrape each candidate; keep only pages that yielded a title.
    products_data = []
    for i, url in enumerate(sorted(product_links), 1):
        print(f"[{i}/{len(product_links)}] Extracting: {url[:80]}...")
        data = extract_product_details(url)
        if data and data['title']:
            products_data.append(data)
            print(f" ✓ {data['title'][:60]}")
            if data['images']:
                print(f" Images: {len(data['images'])} found")
        time.sleep(0.5)  # Be polite to the server

    # Fix: write next to this script instead of the hard-coded absolute
    # path that only existed on the original author's machine.
    output_file = str(Path(__file__).resolve().parent / 'crawled_products.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(products_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Crawling complete!")
    print(f"📦 Total products found: {len(products_data)}")
    print(f"💾 Saved to: {output_file}")

    # Print summary
    print("\n=== Product List ===")
    for i, product in enumerate(products_data, 1):
        print(f"{i}. {product['title']}")


if __name__ == '__main__':
    main()
Reference in New Issue
Block a user