#!/usr/bin/env python3
"""
Crawl all products from dealplustech.co.th and create markdown files
"""

import json
import os
import re
import time
from urllib.parse import unquote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.dealplustech.co.th"
OUTPUT_DIR = "/Users/kunthawatgreethong/Gitea/dealplustech/src/content/products"
IMAGE_DIR = "/Users/kunthawatgreethong/Gitea/dealplustech/public/images/2021/03"

# Substrings that mark a link on the listing page as a product link.
_PRODUCT_URL_KEYWORDS = (
    'product', 'pipe', 'valve', 'pump', 'system',
    'ท่อ', 'เครื่อง', 'อุปกรณ์',
)


def get_soup(url):
    """Fetch *url* and return its parsed HTML.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree, or ``None`` (after printing the error) when
        the request fails or the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # ``response.text`` falls back to ISO-8859-1 when the Content-Type header
    # carries no charset, which garbles UTF-8 Thai pages.  Give BeautifulSoup
    # the raw bytes so it can read the <meta charset> itself, and only forward
    # an encoding the server declared explicitly.
    content_type = response.headers.get('Content-Type', '').lower()
    declared = response.encoding if 'charset' in content_type else None
    return BeautifulSoup(response.content, 'html.parser', from_encoding=declared)


def download_image(url, filename):
    """Download the image at *url* into ``IMAGE_DIR`` as *filename*.

    An already existing file is reused without touching the network.

    Args:
        url: Absolute URL of the image.
        filename: Target file name inside ``IMAGE_DIR``.

    Returns:
        *filename* when the file exists or was downloaded completely,
        otherwise ``None`` (after printing the error).
    """
    if not filename:
        # Joining an empty name would yield IMAGE_DIR itself, which "exists".
        print(f" Failed to download image from {url}: no filename")
        return None

    filepath = os.path.join(IMAGE_DIR, filename)
    if os.path.exists(filepath):
        return filename

    # Stream into a temporary file and rename it only once complete, so an
    # interrupted download is never mistaken for a finished image next run.
    partial_path = filepath + '.part'
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        with requests.get(url, headers=headers, timeout=30, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        return filename
    except (requests.RequestException, OSError) as e:
        print(f" Failed to download {filename}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return None


def create_slug(title):
    """Create URL-friendly slug from Thai title.

    Args:
        title: Product title, possibly mixing Thai and Latin text.

    Returns:
        The lower-cased title with punctuation removed and every run of
        whitespace replaced by a single dash.
    """
    # The explicit Thai range keeps vowel and tone marks, which ``\w`` alone
    # does not match.
    slug = re.sub(r'[^\w\u0E00-\u0E7F\s-]', '', title)
    slug = re.sub(r'\s+', '-', slug)
    return slug.lower()


def _image_filename(img_url):
    """Return the local file name for *img_url* without a WordPress size suffix."""
    # Use only the URL path so query strings never reach the file name, and
    # decode percent-escapes so Thai names are stored as real characters.
    name = os.path.basename(unquote(urlparse(img_url).path))
    # Strip ``-300x200`` only directly before the extension; dimensions that
    # are part of the product name elsewhere in the file name are kept.
    return re.sub(r'-\d+x\d+(?=\.[^.]+$)', '', name)


def extract_product_data(product_url):
    """Extract all product data from URL.

    Reads the title, up to three leading paragraphs, the first content image
    and two-column table rows from the page, downloading the image as a side
    effect.

    Args:
        product_url: Absolute URL of a product page.

    Returns:
        A dict with the keys ``id``, ``name``, ``nameEn``, ``slug``,
        ``description``, ``shortDescription``, ``image``, ``keywords``,
        ``specifications``, ``features`` and ``applications``; or ``None``
        when the page cannot be fetched or has no usable ``<h1>`` title.
    """
    print(f"\n📦 Crawling: {product_url[:80]}...")

    soup = get_soup(product_url)
    if soup is None:
        return None

    # Get title
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else ''
    if not title or len(title) < 3:
        return None

    # Get description from the first direct paragraphs of the content area
    description = ''
    content_div = soup.find('div', class_='entry-content')
    if content_div:
        paragraphs = content_div.find_all('p', recursive=False)
        description = ' '.join(p.get_text(strip=True) for p in paragraphs[:3])

    # Get images; resolve against the page so relative sources still download
    images = [
        urljoin(product_url, img.get('src'))
        for img in soup.select('img[src*="wp-content"]')[:5]
        if img.get('src')
    ]

    # Get product name (English if available): the part after the last " | "
    name_en = ''
    if ' | ' in title:
        name_en = title.split(' | ')[-1].strip()

    # Create slug
    slug = create_slug(title)

    # Download main image
    main_image = None
    if images:
        img_url = images[0]
        downloaded = download_image(img_url, _image_filename(img_url))
        if downloaded:
            main_image = f"/images/2021/03/{downloaded}"

    product_data = {
        'id': slug,
        'name': title,
        'nameEn': name_en,
        'slug': slug,
        'description': description[:500] if description else title,
        'shortDescription': description[:200] if description else title,
        'image': main_image or '/images/2021/03/hdpe-page-full.png',
        'keywords': [title.split()[0]] if title else [],
        'specifications': [],
        'features': [],
        'applications': [],
    }

    # Try to extract specifications from two-column rows of the first tables
    for table in soup.select('table')[:2]:
        for row in table.select('tr'):
            cells = row.select('td, th')
            if len(cells) != 2:
                continue
            label = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            if label and value and len(label) < 50:
                product_data['specifications'].append({
                    'label': label,
                    'value': value,
                })

    print(f" ✓ {title[:60]}")
    if main_image:
        print(f" Image: {main_image}")
    if product_data['specifications']:
        print(f" Specs: {len(product_data['specifications'])} found")

    return product_data


def _yaml_quote(value):
    """Return *value* as a single-line, single-quoted YAML scalar."""
    # Single quotes make ``:``, ``#`` and friends literal; the only escape
    # needed is doubling the quote character itself.
    text = ' '.join(str(value).split())
    return "'" + text.replace("'", "''") + "'"


def _table_cell(value):
    """Return *value* made safe for use inside a markdown table cell."""
    return ' '.join(str(value).split()).replace('|', '\\|')


def create_markdown(product):
    """Create markdown file content.

    Args:
        product: Product dict as returned by ``extract_product_data``.

    Returns:
        The file content: a YAML front matter block followed by a heading,
        the description and, when specifications exist, a table of them.
    """
    specs = product['specifications']

    # Every scalar is quoted: scraped titles and table cells routinely contain
    # ``: `` or `` #``, which would otherwise corrupt the front matter.
    lines = [
        '---',
        f"id: {_yaml_quote(product['id'])}",
        f"name: {_yaml_quote(product['name'])}",
        f"nameEn: {_yaml_quote(product['nameEn'])}",
        f"slug: {_yaml_quote(product['slug'])}",
        f"description: {_yaml_quote(product['description'])}",
        f"shortDescription: {_yaml_quote(product['shortDescription'])}",
        f"image: {_yaml_quote(product['image'])}",
    ]

    if product['keywords']:
        lines.append('keywords:')
        lines.extend(f"  - {_yaml_quote(keyword)}" for keyword in product['keywords'])
    else:
        lines.append('keywords: []')

    if specs:
        lines.append('specifications:')
        for spec in specs:
            # ``value`` must line up with ``label`` to belong to the same item.
            lines.append(f"  - label: {_yaml_quote(spec['label'])}")
            lines.append(f"    value: {_yaml_quote(spec['value'])}")

    lines += ['---', '', f"# {product['name']}", '', product['description'], '']

    if specs:
        lines += [
            '## ข้อมูลจำเพาะ',
            '',
            '| รายการ | รายละเอียด |',
            '|--------|------------|',
        ]
        lines.extend(
            f"| {_table_cell(spec['label'])} | {_table_cell(spec['value'])} |"
            for spec in specs
        )
        lines.append('')

    return '\n'.join(lines) + '\n'


def _host(url):
    """Return the lower-cased host of *url* without a leading ``www.``."""
    netloc = urlparse(url).netloc.lower()
    return netloc[4:] if netloc.startswith('www.') else netloc


def get_all_product_urls():
    """Get all product URLs from main product page.

    Returns:
        A sorted list of absolute, de-duplicated URLs on the shop's own host
        whose address contains one of the product keywords; an empty list
        when the listing page cannot be fetched.
    """
    print("=== Getting all product URLs ===\n")

    listing_url = f"{BASE_URL}/product/"
    soup = get_soup(listing_url)
    if soup is None:
        return []

    base_host = _host(BASE_URL)
    product_urls = set()

    # Find all links in content
    for link in soup.select('a[href]'):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if not href or len(text) <= 3:
            continue

        # Thai path segments arrive percent-encoded, so decode before
        # matching or the Thai keywords can never be found.
        decoded_href = unquote(href).lower()
        if not any(keyword in decoded_href for keyword in _PRODUCT_URL_KEYWORDS):
            continue

        full_url = urljoin(BASE_URL, href)
        if '/wp-' in full_url or '#' in full_url:
            continue
        # Stay on the shop's own site (drops share links, mailto:, tel:, ...).
        if _host(full_url) != base_host:
            continue
        # The listing page links to itself; it is not a product.
        if full_url.rstrip('/') == listing_url.rstrip('/'):
            continue

        product_urls.add(full_url)

    print(f"Found {len(product_urls)} product URLs\n")
    return sorted(product_urls)


def main():
    """Crawl every product page and write one markdown file per new product.

    Existing markdown files are never overwritten.
    """
    print("="*60)
    print("CRAWLING DEAL PLUS TECH - ALL PRODUCTS")
    print("="*60)

    # Create directories
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(IMAGE_DIR, exist_ok=True)

    # Get all product URLs
    product_urls = get_all_product_urls()

    if not product_urls:
        print("No products found! Exiting.")
        return

    # Crawl each product
    products_data = []
    for i, url in enumerate(product_urls, 1):
        print(f"\n[{i}/{len(product_urls)}]")
        data = extract_product_data(url)
        if data:
            products_data.append(data)

        # Be polite
        time.sleep(0.5)

    # Create markdown files
    print(f"\n{'='*60}")
    print("CREATING MARKDOWN FILES")
    print(f"{'='*60}\n")

    created = 0
    for product in products_data:
        if not product['id']:
            # A title made only of punctuation yields an empty slug, which
            # would be written as a hidden ".md" file.
            print(f"✗ Failed: {product['name']} - empty slug")
            continue

        filename = f"{product['id']}.md"
        filepath = os.path.join(OUTPUT_DIR, filename)

        # Skip if exists
        if os.path.exists(filepath):
            print(f"✓ Skip (exists): {filename}")
            continue

        # Create markdown
        md_content = create_markdown(product)

        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(md_content)
        except OSError as e:
            print(f"✗ Failed: {filename} - {e}")
        else:
            print(f"✓ Created: {filename}")
            created += 1

    # Summary
    print(f"\n{'='*60}")
    print("✅ CRAWLING COMPLETE!")
    print(f"{'='*60}")
    print(f"📦 Total products crawled: {len(products_data)}")
    print(f"📝 New markdown files created: {created}")
    print(f"📁 Total products in folder: {len(os.listdir(OUTPUT_DIR))}")
    print(f"\nSaved to: {OUTPUT_DIR}")
    print(f"Images saved to: {IMAGE_DIR}")


if __name__ == '__main__':
    main()