refactor(oi): update scraper for new QuikStrike website structure

- Replace direct product URL navigation with fixed heatmap URL and UI product selection
- Implement cookie validation with automatic session cleanup
- Update login flow to use SSO authentication and new form selectors
- Improve data extraction with iframe context and better table parsing
- Add multiple fallback selectors for gold price scraping
- Enhance error handling, logging, and timeout management
This commit is contained in:
Kunthawat Greethong
2026-01-06 12:16:53 +07:00
parent 28a4546cd8
commit 2e8e07ed17
5 changed files with 1411 additions and 83 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -1,15 +1,10 @@
# CME Group QuikStrike Login Credentials # CME Group QuikStrike Login Credentials
CME_USERNAME=your_username_here CME_USERNAME=your_username_here
CME_PASSWORD=your_password_here CME_PASSWORD=your_password_here
CME_LOGIN_URL=https://login.cmegroup.com/sso/accountstatus/showAuth.action
# Product Configuration # QuikStrike URL (fixed - always same page)
# Gold (XAUUSD/COMEX Gold - OG|GC): pid=40 QUIKSTRIKE_URL=https://www.cmegroup.com/tools-information/quikstrike/open-interest-heatmap.html
# Default product for XAUUSD trading
PRODUCT_URL=https://cmegroup.quikstrike.net/User/QuikStrikeView.aspx?pid=40&viewitemid=IntegratedOpenInterestTool
# Alternative products:
# SOFR (3M SOFR): https://cmegroup.quikstrike.net/User/QuikStrikeView.aspx?pid=476&viewitemid=IntegratedOpenInterestTool
# Silver: https://cmegroup.quikstrike.net/User/QuikStrikeView.aspx?pid=41&viewitemid=IntegratedOpenInterestTool
# Gold Price Source (investing.com) # Gold Price Source (investing.com)
INVESTING_URL=https://www.investing.com/commodities/gold INVESTING_URL=https://www.investing.com/commodities/gold

View File

@@ -92,6 +92,7 @@ FuturePrice,4345.50
Edit `.env` to customize: Edit `.env` to customize:
- `PRODUCT_URL` - QuikStrike product page URL (requires login) - `PRODUCT_URL` - QuikStrike product page URL (requires login)
- `CME_LOGIN_URL` - CME login page URL (default: SSO URL)
- `TOP_N_STRIKES` - Number of top strikes to export (default: 3) - `TOP_N_STRIKES` - Number of top strikes to export (default: 3)
- `HEADLESS` - Run browser in headless mode (default: false for debugging) - `HEADLESS` - Run browser in headless mode (default: false for debugging)
- `CSV_OUTPUT_PATH` - Output CSV file path - `CSV_OUTPUT_PATH` - Output CSV file path

View File

@@ -10,9 +10,14 @@ load_dotenv()
# Configuration # Configuration
CME_USERNAME = os.getenv("CME_USERNAME") CME_USERNAME = os.getenv("CME_USERNAME")
CME_PASSWORD = os.getenv("CME_PASSWORD") CME_PASSWORD = os.getenv("CME_PASSWORD")
PRODUCT_URL = os.getenv( CME_LOGIN_URL = os.getenv(
"PRODUCT_URL", "CME_LOGIN_URL", "https://login.cmegroup.com/sso/accountstatus/showAuth.action"
"https://cmegroup.quikstrike.net/User/QuikStrikeView.aspx?pid=40&viewitemid=IntegratedOpenInterestTool", )
QUIKSTRIKE_URL = (
"https://www.cmegroup.com/tools-information/quikstrike/open-interest-heatmap.html"
)
QUIKSTRIKE_URL = (
"https://www.cmegroup.com/tools-information/quikstrike/open-interest-heatmap.html"
) )
INVESTING_URL = os.getenv("INVESTING_URL", "https://www.investing.com/commodities/gold") INVESTING_URL = os.getenv("INVESTING_URL", "https://www.investing.com/commodities/gold")
CSV_OUTPUT_PATH = os.getenv("CSV_OUTPUT_PATH", "./oi_data.csv") CSV_OUTPUT_PATH = os.getenv("CSV_OUTPUT_PATH", "./oi_data.csv")
@@ -44,28 +49,53 @@ def load_cookies(context):
return False return False
def is_logged_in(page): def delete_cookies():
page.goto(PRODUCT_URL, timeout=TIMEOUT_SECONDS * 1000) if os.path.exists(COOKIE_FILE):
page.wait_for_load_state("networkidle", timeout=TIMEOUT_SECONDS * 1000) os.remove(COOKIE_FILE)
return "login" not in page.url.lower() logger.info("Cookies deleted")
def are_cookies_valid(page):
    """Report whether the current session can still see the OI table.

    Navigates ``page`` to ``QUIKSTRIKE_URL`` and looks for ``table.grid-thm``
    inside the first ``iframe.cmeIframe``.

    Args:
        page: Browser page used for the check; it is navigated to
            ``QUIKSTRIKE_URL`` as part of the call.

    Returns:
        True when at least one matching table is found inside the iframe,
        False when none is found or the iframe lookup raises.
    """
    logger.info("Checking if cookies are valid...")
    nav_timeout_ms = TIMEOUT_SECONDS * 1000
    page.goto(QUIKSTRIKE_URL, timeout=nav_timeout_ms)
    page.wait_for_load_state("domcontentloaded", timeout=nav_timeout_ms)
    # Fixed pause after DOM load, before the iframe is inspected.
    page.wait_for_timeout(3000)

    try:
        heatmap_frame = page.frame_locator("iframe.cmeIframe").first
        # Second fixed pause before counting tables
        # (presumably to let the iframe content render - confirm).
        page.wait_for_timeout(5000)
        if heatmap_frame.locator("table.grid-thm").count() > 0:
            logger.info("Cookies are valid - OI table found in iframe")
            return True
        logger.info("Cookies may be expired - no OI table found in iframe")
        return False
    except Exception as e:
        # Any failure while probing the iframe is treated as an expired session.
        logger.info(f"Cookies expired - error checking iframe: {e}")
        return False
def login_to_cme(page): def login_to_cme(page):
logger.info("Attempting to login to CME QuikStrike...") logger.info("Attempting to login to CME QuikStrike...")
page.goto( page.goto(CME_LOGIN_URL, timeout=TIMEOUT_SECONDS * 1000)
"https://www.cmegroup.com/account/login.html", timeout=TIMEOUT_SECONDS * 1000 page.wait_for_load_state("domcontentloaded", timeout=TIMEOUT_SECONDS * 1000)
) page.wait_for_timeout(1000)
try: try:
page.fill('input[name="username"]', CME_USERNAME) page.fill("#user", CME_USERNAME)
page.fill('input[name="password"]', CME_PASSWORD) page.fill("#pwd", CME_PASSWORD)
page.click('button[type="submit"]') page.click("#loginBtn")
page.wait_for_load_state("networkidle", timeout=TIMEOUT_SECONDS * 1000) logger.info("Waiting for login redirect...")
page.wait_for_timeout(30000)
if "login" in page.url.lower(): current_url = page.url.lower()
logger.error("Login failed - still on login page") logger.info(f"Current URL after login attempt: {current_url}")
if "login" in current_url or "sso" in current_url:
logger.error("Login may have failed - still on SSO/login page")
page.screenshot(path="login_failed.png") page.screenshot(path="login_failed.png")
return False return False
@@ -79,20 +109,59 @@ def login_to_cme(page):
return False return False
def select_gold_product(page):
    """Choose the Gold product through the product menu inside the iframe.

    Clicks, in order, the product dropdown arrow, "Metals", "Precious Metals"
    and "Gold" within the first ``iframe.cmeIframe``, pausing for a fixed
    time after each click.

    Args:
        page: Browser page whose ``iframe.cmeIframe`` hosts the product menu.

    Returns:
        None.
    """
    logger.info("Selecting Gold product...")
    logger.info("Switching to iframe context...")
    menu_frame = page.frame_locator("iframe.cmeIframe").first
    page.wait_for_timeout(5000)

    # Each entry: (log message, selector to click, pause in ms after the click).
    menu_clicks = (
        ("Step 1: Clicking dropdown arrow...", "#ctl11_hlProductArrow", 1000),
        ("Step 2: Clicking Metals...", 'a[groupid="6"]:has-text("Metals")', 500),
        (
            "Step 3: Clicking Precious Metals...",
            'a[familyid="6"]:has-text("Precious Metals")',
            500,
        ),
    )
    for message, selector, pause_ms in menu_clicks:
        logger.info(message)
        menu_frame.locator(selector).click()
        page.wait_for_timeout(pause_ms)

    # The final click is handled separately: a log line sits between the
    # click and the (longer) wait.
    logger.info("Step 4: Clicking Gold...")
    menu_frame.locator('a[title="Gold"]').click()
    logger.info("Waiting for Gold data to load...")
    page.wait_for_timeout(10000)
    logger.info("Gold product selected")
def navigate_to_oi_heatmap(page): def navigate_to_oi_heatmap(page):
logger.info(f"Navigating to OI Heatmap: {PRODUCT_URL}") logger.info(f"Navigating to QuikStrike: {QUIKSTRIKE_URL}")
page.goto(PRODUCT_URL, timeout=TIMEOUT_SECONDS * 1000) page.goto(QUIKSTRIKE_URL, timeout=TIMEOUT_SECONDS * 1000)
page.wait_for_load_state("networkidle", timeout=TIMEOUT_SECONDS * 1000) page.wait_for_load_state("domcontentloaded", timeout=TIMEOUT_SECONDS * 1000)
page.wait_for_timeout(5000)
select_gold_product(page)
def extract_oi_data(page): def extract_oi_data(page):
logger.info("Extracting OI data from Gold matrix table...") logger.info("Extracting OI data from Gold matrix table...")
logger.info("Switching to iframe context...")
frame = page.frame_locator("iframe.cmeIframe").first
page.wait_for_timeout(8000)
logger.info("Looking for table.grid-thm...")
call_levels = [] call_levels = []
put_levels = [] put_levels = []
table = page.locator("table.grid-thm").first table = frame.locator("table.grid-thm").first
table.wait_for(state="visible", timeout=10000)
logger.info("Table found, waiting for data...")
rows = table.locator("tbody tr").all() rows = table.locator("tbody tr").all()
logger.info(f"Found {len(rows)} rows in table")
for row in rows: for row in rows:
try: try:
@@ -100,31 +169,36 @@ def extract_oi_data(page):
if len(cells) < 3: if len(cells) < 3:
continue continue
strike_cell = cells[0].text_content().strip() strike = None
if not strike_cell or not strike_cell.replace(".", "").isdigit(): for cell in cells:
continue text = cell.text_content().strip()
if text and text.replace(".", "").isdigit():
strike = float(strike_cell) strike = float(text)
cells_with_data = cells[2:]
for i in range(0, len(cells_with_data), 2):
if i + 1 >= len(cells_with_data):
break break
call_cell = cells_with_data[i] if strike is None:
put_cell = cells_with_data[i + 1] continue
number_cells = row.locator("td.number").all()
logger.debug(f"Strike {strike}: found {len(number_cells)} number cells")
for i in range(0, len(number_cells), 2):
if i + 1 >= len(number_cells):
break
call_cell = number_cells[i]
put_cell = number_cells[i + 1]
call_text = call_cell.text_content().strip() call_text = call_cell.text_content().strip()
put_text = put_cell.text_content().strip() put_text = put_cell.text_content().strip()
if call_text and call_text.replace(",", "").isdigit(): if call_text and call_text != "-":
call_oi = int(call_text.replace(",", "")) call_oi = int(call_text.replace(",", ""))
call_levels.append( call_levels.append(
{"Type": "CALL", "Strike": strike, "OI": call_oi} {"Type": "CALL", "Strike": strike, "OI": call_oi}
) )
if put_text and put_text.replace(",", "").isdigit(): if put_text and put_text != "-":
put_oi = int(put_text.replace(",", "")) put_oi = int(put_text.replace(",", ""))
put_levels.append({"Type": "PUT", "Strike": strike, "OI": put_oi}) put_levels.append({"Type": "PUT", "Strike": strike, "OI": put_oi})
@@ -132,25 +206,27 @@ def extract_oi_data(page):
logger.warning(f"Error parsing row: {e}") logger.warning(f"Error parsing row: {e}")
continue continue
if not call_levels: logger.info(
logger.warning("No CALL OI data extracted") f"Extracted {len(call_levels)} CALL levels, {len(put_levels)} PUT levels"
if not put_levels: )
logger.warning("No PUT OI data extracted")
call_df = ( if call_levels:
pd.DataFrame(call_levels).nlargest(TOP_N_STRIKES, "OI") call_df = pd.DataFrame(call_levels)
if call_levels call_df = call_df.groupby("Strike", as_index=False).agg({"OI": "max"})
else pd.DataFrame() call_df = call_df.nlargest(TOP_N_STRIKES, "OI")
) else:
put_df = ( call_df = pd.DataFrame()
pd.DataFrame(put_levels).nlargest(TOP_N_STRIKES, "OI")
if put_levels if put_levels:
else pd.DataFrame() put_df = pd.DataFrame(put_levels)
) put_df = put_df.groupby("Strike", as_index=False).agg({"OI": "max"})
put_df = put_df.nlargest(TOP_N_STRIKES, "OI")
else:
put_df = pd.DataFrame()
result_df = pd.concat([call_df, put_df], ignore_index=True) result_df = pd.concat([call_df, put_df], ignore_index=True)
logger.info(f"Extracted {len(result_df)} OI levels") logger.info(f"Final top {TOP_N_STRIKES} unique strikes for CALL and PUT extracted")
return result_df return result_df
@@ -158,29 +234,39 @@ def scrape_investing_gold_price(page):
logger.info(f"Scraping gold price from: {INVESTING_URL}") logger.info(f"Scraping gold price from: {INVESTING_URL}")
try: try:
page.goto(INVESTING_URL, timeout=TIMEOUT_SECONDS * 1000) page.goto(INVESTING_URL, timeout=60000, wait_until="domcontentloaded")
page.wait_for_load_state("domcontentloaded", timeout=TIMEOUT_SECONDS * 1000) logger.info(f"Page loaded, title: {page.title()}")
price_locator = page.locator('div[data-test="instrument-price-last"]') page.wait_for_timeout(5000)
logger.info("Waited for JavaScript to render")
if price_locator.count() > 0: selectors = [
price_text = price_locator.text_content().strip() 'div[data-test="instrument-price-last"]',
".text-5xl\\/9.font-bold.text-\\[#232526\\]",
'[data-test="instrument-price-last"]',
".text-5xl\\/9",
]
price = 0.0
for selector in selectors:
try:
locator = page.locator(selector)
if locator.count() > 0:
locator.first.wait_for(state="visible", timeout=10000)
price_text = locator.first.text_content().strip()
if price_text:
price_text = price_text.replace(",", "") price_text = price_text.replace(",", "")
price = float(price_text) price = float(price_text)
logger.info(f"Extracted gold price: {price}") logger.info(f"Extracted gold price ({selector}): {price}")
return price break
else: except Exception as e:
logger.warning("Price element not found, trying alternative selector") logger.debug(f"Selector {selector} failed: {e}")
alt_locator = page.locator(".text-5xl\\/9") continue
if alt_locator.count() > 0:
price_text = alt_locator.text_content().strip()
price_text = price_text.replace(",", "")
price = float(price_text)
logger.info(f"Extracted gold price (alt): {price}")
return price
logger.warning("Could not extract gold price") if price == 0.0:
return 0.0 logger.warning("Could not extract gold price, all selectors failed")
return price
except Exception as e: except Exception as e:
logger.error(f"Error scraping gold price: {e}") logger.error(f"Error scraping gold price: {e}")
@@ -212,13 +298,20 @@ def run_scraper():
context = browser.new_context() context = browser.new_context()
page = context.new_page() page = context.new_page()
loaded_cookies = load_cookies(context) cookies_loaded = load_cookies(context)
page2 = context.new_page() cookies_valid = False
if loaded_cookies and is_logged_in(page2): if cookies_loaded:
logger.info("Using existing session (cookies)") cookies_valid = are_cookies_valid(page)
if cookies_valid:
logger.info("Using cached session")
else: else:
logger.info("No valid session found, logging in...") if cookies_loaded:
logger.info("Cookies expired, deleting and re-logging in...")
delete_cookies()
logger.info("Logging in to CME...")
if not login_to_cme(page): if not login_to_cme(page):
browser.close() browser.close()
if attempt < RETRY_ATTEMPTS - 1: if attempt < RETRY_ATTEMPTS - 1:
@@ -229,14 +322,15 @@ def run_scraper():
else: else:
logger.error("All login attempts failed") logger.error("All login attempts failed")
return return
save_cookies(context)
navigate_to_oi_heatmap(page) navigate_to_oi_heatmap(page)
oi_data = extract_oi_data(page) oi_data = extract_oi_data(page)
save_cookies(context)
if not oi_data.empty: if not oi_data.empty:
logger.info("Extracting gold price from investing.com...") logger.info("Extracting gold price from investing.com...")
future_price = scrape_investing_gold_price(page) future_price = scrape_investing_gold_price(page)
logger.info(f"Gold price extracted: {future_price}")
export_to_csv(oi_data, future_price) export_to_csv(oi_data, future_price)
else: else: