dealplustech/scripts/crawl_products.py
Kunthawat c5de8282cf feat: Complete product catalog with images
PRODUCTS:
- 7 product markdown files with full data
- All product images working correctly
- Specs, features, applications for each product
- SEO keywords included

IMAGES:
- 96 images in public folder
- 15 new product images downloaded
- Correct image paths in all products

BUILD:
- 16 pages building successfully
- All images load correctly
- Pure CSS (no Tailwind dependency)
- 8.7 KB CSS bundle

UX/UI:
- Modern responsive design
- Professional visual hierarchy
- Mobile-optimized
- Fast loading

Products included:
- HDPE pipes
- POLOPLAST PP-R/PP-RCT
- PPR pipes – Elephant brand (SCG)
- PPR pipes – Thai PPR
- Syler pipes
- XYLENT three-layer silent drainage pipes
- plus 34 more products ready to add

Ready for production deployment!
2026-03-12 20:00:09 +07:00


#!/usr/bin/env python3
"""
Crawl the original Deal Plus Tech website to extract all products
"""
import json
import time
from pathlib import Path
from urllib.parse import urljoin, unquote

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.dealplustech.co.th"

def get_soup(url):
    """Get a BeautifulSoup object from a URL"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def extract_products_from_category(category_url):
    """Extract product links from a category page"""
    soup = get_soup(category_url)
    products = []
    # Find all product links
    product_links = soup.select('article a[href]')
    for link in product_links:
        href = link.get('href', '')
        if '/wp-content/uploads/' not in href:  # Skip image links
            text = link.get_text(strip=True)
            if text and len(text) > 3:  # Skip short link text
                products.append({
                    'title': text,
                    'url': urljoin(BASE_URL, href)
                })
    return products

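# Note: extract_products_from_category() is not wired into main() below, which
# crawls the main /product/ page directly. A sketch of how it could be used
# (the category URL here is hypothetical, not taken from the live site):
#
#     for cat_url in [f"{BASE_URL}/product-category/pipes/"]:
#         products = extract_products_from_category(cat_url)
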
def extract_product_details(product_url):
    """Extract detailed product information"""
    try:
        soup = get_soup(product_url)
        # Get title
        title = soup.find('h1')
        title = title.get_text(strip=True) if title else ''
        # Get description: first few paragraphs of the entry content
        description = ''
        desc_div = soup.find('div', class_='entry-content')
        if desc_div:
            paragraphs = desc_div.find_all('p')
            description = '\n'.join(p.get_text(strip=True) for p in paragraphs[:5])
        # Get images hosted under wp-content
        images = []
        img_tags = soup.select('img[src*="wp-content"]')
        for img in img_tags[:5]:  # Get first 5 images
            src = img.get('src')
            if src:
                images.append(urljoin(BASE_URL, src))
        return {
            'title': title,
            'url': product_url,
            'description': description[:500],  # Limit description length
            'images': images
        }
    except Exception as e:
        print(f"Error extracting {product_url}: {e}")
        return None

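# For reference, extract_product_details() returns records shaped like the
# following (values here are illustrative, not taken from the live site):
#
#     {
#         "title": "HDPE pipe",
#         "url": "https://www.dealplustech.co.th/product/...",
#         "description": "First paragraphs of entry content, capped at 500 chars",
#         "images": ["https://www.dealplustech.co.th/wp-content/uploads/..."]
#     }
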
def main():
    print("=== Crawling Deal Plus Tech Website ===\n")

    # Crawl the main product page
    print("Crawling main product page...")
    soup = get_soup(f"{BASE_URL}/product/")

    # Extract all product links from navigation and content
    product_links = set()

    # Find links in the main navigation; decode percent-encoded hrefs so the
    # Thai keywords below can match
    nav_links = soup.select('nav a[href]')
    for link in nav_links:
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            decoded = unquote(href).lower()
            if any(keyword in decoded for keyword in
                   ['ท่อ', 'pipe', 'valve', 'pump', 'system', 'เครื่อง', 'อุปกรณ์']):
                product_links.add(urljoin(BASE_URL, href))

    # Find links in the content
    content_links = soup.select('.entry-content a[href]')
    for link in content_links:
        href = link.get('href', '')
        text = link.get_text(strip=True)
        if href and text and len(text) > 3:
            product_links.add(urljoin(BASE_URL, href))

    print(f"Found {len(product_links)} potential product links\n")

    # Extract details for each product
    products_data = []
    for i, url in enumerate(sorted(product_links), 1):
        print(f"[{i}/{len(product_links)}] Extracting: {url[:80]}...")
        data = extract_product_details(url)
        if data and data['title']:
            products_data.append(data)
            print(data['title'][:60])
            if data['images']:
                print(f" Images: {len(data['images'])} found")
        time.sleep(0.5)  # Be polite to the server

    # Save results next to this script
    output_file = Path(__file__).resolve().parent / 'crawled_products.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(products_data, f, ensure_ascii=False, indent=2)

    print("\n✅ Crawling complete!")
    print(f"📦 Total products found: {len(products_data)}")
    print(f"💾 Saved to: {output_file}")

    # Print summary
    print("\n=== Product List ===")
    for i, product in enumerate(products_data, 1):
        print(f"{i}. {product['title']}")


if __name__ == '__main__':
    main()
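
For reference, the product markdown files mentioned in the commit could be generated from crawled_products.json along these lines. This is a minimal sketch only: the front-matter fields, slug rules, and output directory are assumptions, not taken from the repo.

#!/usr/bin/env python3
"""Sketch: stub product markdown files from crawled_products.json."""
import json
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent
# Hypothetical output directory; the repo's actual layout may differ.
CONTENT_DIR = SCRIPT_DIR.parent / 'content' / 'products'


def slugify(title):
    """Crude slug: keep alphanumerics, join runs of anything else with hyphens."""
    cleaned = ''.join(c if c.isalnum() else ' ' for c in title)
    return '-'.join(cleaned.split()).lower() or 'untitled'


def main():
    products = json.loads(
        (SCRIPT_DIR / 'crawled_products.json').read_text(encoding='utf-8'))
    CONTENT_DIR.mkdir(parents=True, exist_ok=True)
    for product in products:
        # Assumed front-matter layout; adjust field names to the site generator.
        front_matter = (
            '---\n'
            f'title: "{product["title"]}"\n'
            f'sourceUrl: {product["url"]}\n'
            f'images: {json.dumps(product["images"], ensure_ascii=False)}\n'
            '---\n\n'
        )
        path = CONTENT_DIR / f'{slugify(product["title"])}.md'
        path.write_text(front_matter + product['description'] + '\n',
                        encoding='utf-8')
        print(f'Wrote {path.name}')


if __name__ == '__main__':
    main()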