#!/usr/bin/env python3 """ Crawl Deal Plus Tech original website to extract all products """ import requests from bs4 import BeautifulSoup import json import time from urllib.parse import urljoin, unquote import re BASE_URL = "https://www.dealplustech.co.th" def get_soup(url): """Get BeautifulSoup object from URL""" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', } response = requests.get(url, headers=headers, timeout=30) response.raise_for_status() return BeautifulSoup(response.text, 'html.parser') def extract_products_from_category(category_url): """Extract product links from category page""" soup = get_soup(category_url) products = [] # Find all product links product_links = soup.select('article a[href]') for link in product_links: href = link.get('href', '') if '/wp-content/uploads/' not in href: # Skip image links text = link.get_text(strip=True) if text and len(text) > 3: # Skip short text products.append({ 'title': text, 'url': urljoin(BASE_URL, href) }) return products def extract_product_details(product_url): """Extract detailed product information""" try: soup = get_soup(product_url) # Get title title = soup.find('h1') title = title.get_text(strip=True) if title else '' # Get description description = '' desc_div = soup.find('div', class_='entry-content') if desc_div: paragraphs = desc_div.find_all('p') description = '\n'.join([p.get_text(strip=True) for p in paragraphs[:5]]) # Get images images = [] img_tags = soup.select('img[src*="wp-content"]') for img in img_tags[:5]: # Get first 5 images src = img.get('src') if src: images.append(urljoin(BASE_URL, src)) return { 'title': title, 'url': product_url, 'description': description[:500], # Limit description length 'images': images } except Exception as e: print(f"Error extracting {product_url}: {e}") return None def main(): print("=== Crawling Deal Plus Tech Website ===\n") # Product categories to crawl categories = [ f"{BASE_URL}/product/", ] all_products = [] # Crawl main product page print("Crawling main product page...") soup = get_soup(f"{BASE_URL}/product/") # Extract all product links from navigation and content product_links = set() # Find links in main navigation nav_links = soup.select('nav a[href]') for link in nav_links: href = link.get('href', '') text = link.get_text(strip=True) if href and text and len(text) > 3: if any(keyword in href.lower() for keyword in ['ท่อ', 'pipe', 'valve', 'pump', 'system', 'เครื่อง', 'อุปกรณ์']): product_links.add(urljoin(BASE_URL, href)) # Find links in content content_links = soup.select('.entry-content a[href]') for link in content_links: href = link.get('href', '') text = link.get_text(strip=True) if href and text and len(text) > 3: product_links.add(urljoin(BASE_URL, href)) print(f"Found {len(product_links)} potential product links\n") # Extract details for each product products_data = [] for i, url in enumerate(sorted(product_links), 1): print(f"[{i}/{len(product_links)}] Extracting: {url[:80]}...") data = extract_product_details(url) if data and data['title']: products_data.append(data) print(f" ✓ {data['title'][:60]}") if data['images']: print(f" Images: {len(data['images'])} found") time.sleep(0.5) # Be polite to the server # Save results output_file = '/Users/kunthawatgreethong/Gitea/dealplustech/scripts/crawled_products.json' with open(output_file, 'w', encoding='utf-8') as f: json.dump(products_data, f, ensure_ascii=False, indent=2) print(f"\n✅ Crawling complete!") print(f"📦 Total products found: {len(products_data)}") print(f"💾 Saved to: {output_file}") # Print summary print("\n=== Product List ===") for i, product in enumerate(products_data, 1): print(f"{i}. {product['title']}") if __name__ == '__main__': main()