✅ PRODUCTS: - 7 product markdown files with full data - All product images working correctly - Specs, features, applications for each product - SEO keywords included ✅ IMAGES: - 96 images in public folder - 15 new product images downloaded - Correct image paths in all products ✅ BUILD: - 16 pages building successfully - All images load correctly - Pure CSS (no Tailwind dependency) - 8.7KB CSS bundle ✅ UX/UI: - Modern responsive design - Professional visual hierarchy - Mobile-optimized - Fast loading Products included: - ท่อ HDPE - PP-R/PP-RCT POLOPLAST - ท่อ PPR ตราช้าง (SCG) - ท่อ PPR – Thai PPR - ท่อไซเลอร์ (Syler) - ท่อระบายน้ำ 3 ชั้น ไซเลนท์ (XYLENT) - + 34 more products ready to add Ready for production deployment!
141 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Crawl Deal Plus Tech original website to extract all products
|
|
"""
|
|
import json
import re
import time
from pathlib import Path
from urllib.parse import urljoin, unquote

import requests
from bs4 import BeautifulSoup
|
|
|
BASE_URL = "https://www.dealplustech.co.th"
|
|
|
|
def get_soup(url):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Sends a browser-like User-Agent so the WordPress site serves the
    normal HTML page. Raises requests.HTTPError on a non-2xx response
    and requests.Timeout after 30 seconds.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    resp = requests.get(url, headers=request_headers, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')
|
def extract_products_from_category(category_url):
    """Collect product links from a category listing page.

    Returns a list of {'title', 'url'} dicts, one per anchor found inside
    an <article> element, with media/attachment links filtered out.
    """
    soup = get_soup(category_url)
    found = []

    # Product cards are rendered as <article> elements; every anchor inside
    # one is a candidate product link.
    for anchor in soup.select('article a[href]'):
        href = anchor.get('href', '')
        # Links into wp-content/uploads point at images, not product pages.
        if '/wp-content/uploads/' in href:
            continue
        label = anchor.get_text(strip=True)
        # Skip icon-only or otherwise too-short anchor text.
        if label and len(label) > 3:
            found.append({
                'title': label,
                'url': urljoin(BASE_URL, href),
            })

    return found
|
def extract_product_details(product_url):
    """Scrape one product page for its title, description, and images.

    Returns a dict with keys 'title', 'url', 'description' (capped at 500
    characters), and 'images' (at most 5 absolute URLs), or None when the
    page cannot be fetched or parsed — crawling is best-effort per page.
    """
    try:
        soup = get_soup(product_url)

        # Page title comes from the first <h1>, if any.
        heading = soup.find('h1')
        title = heading.get_text(strip=True) if heading else ''

        # Description: join the first five paragraphs of the WordPress
        # entry-content container.
        description = ''
        content = soup.find('div', class_='entry-content')
        if content:
            description = '\n'.join(
                p.get_text(strip=True) for p in content.find_all('p')[:5]
            )

        # Images: up to five uploads referenced on the page, made absolute.
        images = [
            urljoin(BASE_URL, tag.get('src'))
            for tag in soup.select('img[src*="wp-content"]')[:5]
            if tag.get('src')
        ]

        return {
            'title': title,
            'url': product_url,
            'description': description[:500],  # Limit description length
            'images': images,
        }
    except Exception as e:
        # Deliberate broad catch: one bad page must not abort the crawl.
        print(f"Error extracting {product_url}: {e}")
        return None
|
def main():
    """Crawl the Deal Plus Tech site and dump product data to JSON.

    Discovers candidate product URLs on the main /product/ page (from both
    the site navigation and the page content), scrapes each one with
    extract_product_details(), and writes the results as JSON next to this
    script.
    """
    print("=== Crawling Deal Plus Tech Website ===\n")

    # Crawl main product page
    print("Crawling main product page...")
    soup = get_soup(f"{BASE_URL}/product/")

    # Candidate product URLs, deduplicated.
    product_links = set()

    # Navigation links: keep only URLs mentioning a product-related keyword.
    # WordPress percent-encodes Thai path segments, so the raw href would
    # never contain 'ท่อ' etc. — decode with unquote() before matching.
    keywords = ['ท่อ', 'pipe', 'valve', 'pump', 'system', 'เครื่อง', 'อุปกรณ์']
    for link in soup.select('nav a[href]'):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            decoded = unquote(href).lower()
            if any(keyword in decoded for keyword in keywords):
                product_links.add(urljoin(BASE_URL, href))

    # Content links: accept anything with a reasonable-length label.
    for link in soup.select('.entry-content a[href]'):
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            product_links.add(urljoin(BASE_URL, href))

    print(f"Found {len(product_links)} potential product links\n")

    # Scrape each candidate page; pages that fail or lack a title are skipped.
    products_data = []
    for i, url in enumerate(sorted(product_links), 1):
        print(f"[{i}/{len(product_links)}] Extracting: {url[:80]}...")
        data = extract_product_details(url)
        if data and data['title']:
            products_data.append(data)
            print(f" ✓ {data['title'][:60]}")
            if data['images']:
                print(f" Images: {len(data['images'])} found")
        time.sleep(0.5)  # Be polite to the server

    # Save results next to this script instead of a hard-coded user path,
    # so the crawler works on any machine/checkout location.
    output_file = str(Path(__file__).resolve().parent / 'crawled_products.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(products_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Crawling complete!")
    print(f"📦 Total products found: {len(products_data)}")
    print(f"💾 Saved to: {output_file}")

    # Print summary
    print("\n=== Product List ===")
    for i, product in enumerate(products_data, 1):
        print(f"{i}. {product['title']}")
|
# Script entry point: run the crawl only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()