hvac-kia-content/src/tiktok_scraper_advanced.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

import os
import time
import random
from typing import Any, Dict, List, Optional
from datetime import datetime, timedelta
from pathlib import Path
import json
import re
from scrapling import StealthyFetcher, Adaptor
from src.base_scraper import BaseScraper, ScraperConfig


class TikTokScraperAdvanced(BaseScraper):
    """TikTok scraper using advanced Scrapling configuration for bot-detection avoidance."""

    def __init__(self, config: ScraperConfig):
        super().__init__(config)
        self.target_username = os.getenv('TIKTOK_TARGET', 'hkia')
        self.base_url = f"https://www.tiktok.com/@{self.target_username}"

        # Configure global StealthyFetcher settings
        StealthyFetcher.auto_match = True  # Enable automatic element matching
        StealthyFetcher.huge_tree = True   # Allow large HTML trees

    def _enhanced_typing(self, element, text: str):
        """Simulate realistic typing patterns (roughly 30-70 WPM, with occasional typos)."""
        for char in text:
            # Variable typing speed
            base_delay = random.uniform(0.08, 0.25)

            # Pause on complex characters
            if char in '@._-':
                base_delay *= random.uniform(1.2, 2.0)

            # Occasional hesitation (10% chance)
            if random.random() < 0.1:
                time.sleep(random.uniform(0.3, 0.8))

            element.type(char)
            time.sleep(base_delay)

            # Typo correction (3% chance)
            if random.random() < 0.03:
                element.press('Backspace')
                time.sleep(random.uniform(0.1, 0.3))
                element.type(char)

    def _advanced_human_simulation(self, page):
        """Simulate natural page-reading behavior (scrolling and mouse movement)."""
        try:
            viewport_height = page.viewport_size.get('height', 800)

            # Natural scrolling patterns
            for i in range(random.randint(3, 6)):
                scroll_amount = random.randint(100, viewport_height // 3)
                page.mouse.wheel(0, scroll_amount)
                time.sleep(random.uniform(0.8, 2.5))  # Reading time

                # Occasional back-scroll (re-reading)
                if random.random() < 0.3:
                    page.mouse.wheel(0, -random.randint(50, 150))

            # Random mouse movements
            for _ in range(random.randint(2, 4)):
                x = random.randint(100, page.viewport_size.get('width', 1200) - 100)
                y = random.randint(100, page.viewport_size.get('height', 800) - 100)
                page.mouse.move(x, y)
                time.sleep(random.uniform(0.3, 0.8))
        except Exception as e:
            self.logger.debug(f"Human simulation error (non-critical): {e}")

    def _human_delay(self, min_seconds: float = 2, max_seconds: float = 5) -> None:
        """Sleep for a random, human-like interval between actions."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds (human-like delay)...")
        time.sleep(delay)

    def fetch_posts(self, max_posts: int = 20, enable_scrolling: bool = True) -> List[Dict[str, Any]]:
        """Fetch posts from the TikTok profile using advanced stealth configuration.

        Args:
            max_posts: Maximum number of posts to fetch.
            enable_scrolling: Whether to scroll the profile page to load more videos.
        """
        posts_data = []
        try:
            self.logger.info(f"Fetching TikTok posts from @{self.target_username}")

            # Advanced stealth configuration for TikTok
            self.logger.info(f"Loading {self.base_url} with advanced stealth settings...")
            response = StealthyFetcher.fetch(
                url=self.base_url,
                # Display and stealth settings
                headless=False,           # Visible browser for manual CAPTCHA intervention
                # Network and resource management
                block_webrtc=True,        # Prevent WebRTC IP leaks
                allow_webgl=True,         # CRITICAL: required by modern anti-bot checks
                block_images=False,       # Keep images for CAPTCHA visibility
                disable_ads=True,         # Block ads for a cleaner experience
                disable_resources=False,  # Keep all resources to avoid detection
                # Geographic and fingerprinting
                geoip=True,               # Automatic geolocation spoofing
                os_randomize=True,        # Randomize OS fingerprints
                google_search=True,       # Set Google as the referrer
                # Humanization and behavior
                humanize=True,            # Enable human-like mouse movements
                # Performance and timing
                network_idle=True,        # Wait for network-idle state
                timeout=120000,           # 2-minute timeout (reduced for testing)
                wait=3000,                # 3-second wait after page load
                # Enhanced headers for better compatibility
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,en-CA;q=0.8",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "max-age=0",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1",
                    "Sec-Fetch-Dest": "document",
                    "Sec-Fetch-Mode": "navigate",
                    "Sec-Fetch-Site": "none",
                    "Sec-Fetch-User": "?1"
                }
            )
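
            # NOTE (assumption): the keyword arguments above track a specific
            # Scrapling release; a flag unsupported by the installed version
            # may raise a TypeError, so pin the scrapling dependency
            # alongside this scraper.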

            if not response:
                self.logger.error("Failed to load TikTok profile")
                return posts_data

            self.logger.info("Page loaded successfully, performing human simulation...")

            # Perform advanced human simulation if we have access to the page object
            try:
                # Note: this would need to be adapted to Scrapling's page API
                # self._advanced_human_simulation(page)
                pass
            except Exception as e:
                self.logger.debug(f"Human simulation not available: {e}")

            # Wait for a human-like delay
            self._human_delay(3, 6)

            # Optional: scroll to load more videos
            if enable_scrolling and max_posts > 20:
                self.logger.info(f"Scrolling to load more videos (targeting {max_posts} posts)...")

                # Simulate scrolling to trigger lazy loading
                for scroll_attempt in range(min(5, max_posts // 10)):
                    try:
                        # Scroll down progressively
                        self.logger.debug(f"Scroll attempt {scroll_attempt + 1}")
                        # Note: actual scrolling would need adaptation to Scrapling's API
                        self._human_delay(2, 4)
                    except Exception as e:
                        self.logger.debug(f"Scrolling error (non-critical): {e}")
                        break

            # Extract video items using multiple strategies
            video_items = []

            # Strategy 1: primary TikTok selectors
            video_items = response.css("[data-e2e='user-post-item']")
            self.logger.info(f"Strategy 1 found {len(video_items)} items with user-post-item selector")

            # Strategies 2-3: alternative selectors
            if not video_items:
                video_items = response.css("div[class*='DivItemContainer']")
                self.logger.info(f"Strategy 2 found {len(video_items)} items with DivItemContainer selector")

            if not video_items:
                video_items = response.css("div[class*='video-feed-item']")
                self.logger.info(f"Strategy 3 found {len(video_items)} items with video-feed-item selector")

            # Strategy 4: look for video links directly
            if not video_items:
                video_links = response.css("a[href*='/video/']")
                self.logger.info(f"Strategy 4 found {len(video_links)} direct video links")

                for idx, link in enumerate(video_links[:max_posts]):
                    try:
                        # Extract href from the matched link itself using the
                        # ::attr() pseudo-selector
                        href = ""
                        href_elements = link.css("::attr(href)")
                        if href_elements:
                            href = href_elements[0]

                        if not href:
                            continue
                        if not href.startswith('http'):
                            href = f"https://www.tiktok.com{href}"

                        video_id_match = re.search(r'/video/(\d+)', href)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': '',
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': href,
                            'views': 0,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)
                    except Exception as e:
                        self.logger.error(f"Error processing video link {idx}: {e}")
                        continue

            # Process structured video items found by strategies 1-3
            if video_items and not posts_data:
                self.logger.info(f"Processing {len(video_items)} structured video items...")

                for idx, item in enumerate(video_items[:max_posts]):
                    try:
                        # Extract the video URL using the ::attr() selector
                        video_url = ""
                        href_elements = item.css("a[href*='/video/']::attr(href)")
                        if href_elements:
                            video_url = href_elements[0]

                        if not video_url:
                            # Try an alternative approach
                            link_elements = item.css("a")
                            for link_elem in link_elements:
                                href_attrs = link_elem.css("::attr(href)")
                                if href_attrs and '/video/' in str(href_attrs[0]):
                                    video_url = href_attrs[0]
                                    break

                        if not video_url:
                            continue
                        if not video_url.startswith('http'):
                            video_url = f"https://www.tiktok.com{video_url}"

                        # Extract the video ID from the URL
                        video_id_match = re.search(r'/video/(\d+)', video_url)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        # Extract caption/description using the ::text selector
                        caption = ""
                        caption_elements = item.css("div[data-e2e='browse-video-desc'] span::text")
                        if caption_elements:
                            caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        if not caption:
                            caption_elements = item.css("div[class*='DivContainer'] span::text")
                            if caption_elements:
                                caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        # Extract the view count using the ::text selector
                        views_text = "0"
                        views_elements = item.css("strong[data-e2e='video-views']::text")
                        if views_elements:
                            views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        if not views_text or views_text == "0":
                            views_elements = item.css("strong::text")
                            if views_elements:
                                views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        views = self._parse_count(views_text)

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': caption,
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': video_url,
                            'views': views,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)

                        if idx % 5 == 0 and idx > 0:
                            self.logger.info(f"Processed {idx} videos...")
                    except Exception as e:
                        self.logger.error(f"Error processing video item {idx}: {e}")
                        continue

            # Final fallback: extract video IDs from embedded page scripts
            if not posts_data:
                self.logger.info("No posts found via selectors, checking page scripts...")
                scripts = response.css("script")

                for script in scripts:
                    script_text_elements = script.css("::text")
                    if not script_text_elements:
                        continue
                    script_text = script_text_elements[0] if isinstance(script_text_elements, list) else str(script_text_elements)

                    if '__UNIVERSAL_DATA_FOR_REHYDRATION__' in script_text or 'window.__INIT_PROPS__' in script_text:
                        try:
                            # Look for video IDs in the script content
                            urls = re.findall(r'/video/(\d+)', script_text)
                            unique_ids = list(set(urls))  # Remove duplicates
                            self.logger.info(f"Found {len(unique_ids)} unique video IDs in script data")

                            for video_id in unique_ids[:max_posts]:
                                post_data = {
                                    'id': video_id,
                                    'type': 'video',
                                    'caption': '',
                                    'author': self.target_username,
                                    'publish_date': datetime.now(self.tz).isoformat(),
                                    'link': f"https://www.tiktok.com/@{self.target_username}/video/{video_id}",
                                    'views': 0,
                                    'platform': 'tiktok'
                                }
                                posts_data.append(post_data)
                        except Exception as e:
                            self.logger.debug(f"Could not parse script data: {e}")
                            continue

            self.logger.info(f"Successfully fetched {len(posts_data)} TikTok posts")
        except Exception as e:
            self.logger.error(f"Error fetching TikTok posts: {e}")
            import traceback
            self.logger.error(traceback.format_exc())

        return posts_data

    def _fetch_video_details(self, video_url: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed information from an individual TikTok video page.

        Args:
            video_url: URL of the TikTok video.

        Returns:
            Dictionary with caption and additional metadata, or None on failure.
        """
        try:
            self.logger.debug(f"Fetching details for: {video_url}")

            # Fetch the individual video page with stealth settings
            video_response = StealthyFetcher.fetch(
                url=video_url,
                headless=False,
                block_webrtc=True,
                allow_webgl=True,
                block_images=False,
                disable_ads=True,
                geoip=True,
                os_randomize=True,
                google_search=True,
                humanize=True,
                network_idle=True,
                timeout=60000,  # 1-minute timeout for individual pages
                wait=2000,
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1"
                }
            )

            if not video_response:
                self.logger.warning(f"Failed to load video page: {video_url}")
                return None

            details = {}

            # Extract caption/description from the video page
            caption_selectors = [
                "h1[data-e2e='browse-video-desc']",
                "div[data-e2e='browse-video-desc']",
                "span[data-e2e='browse-video-desc']",
                "div.video-meta-caption",
                "div[class*='DivVideoInfoContainer'] span",
                "h1.video-meta-title",
                "meta[property='og:description']::attr(content)"
            ]

            caption = ""
            for selector in caption_selectors:
                try:
                    # Don't append ::text to selectors that already target an attribute
                    query = selector if '::attr' in selector else f"{selector}::text"
                    caption_elements = video_response.css(query)
                    if caption_elements:
                        caption = ' '.join(str(elem).strip() for elem in caption_elements if elem)
                        if caption:
                            self.logger.debug(f"Found caption with selector: {selector}")
                            break
                except Exception:
                    continue

            details['caption'] = caption

            # Try to extract additional metadata
            # Likes
            likes_elements = video_response.css("strong[data-e2e='like-count']::text")
            if likes_elements:
                details['likes'] = self._parse_count(str(likes_elements[0]))

            # Comments
            comments_elements = video_response.css("strong[data-e2e='comment-count']::text")
            if comments_elements:
                details['comments'] = self._parse_count(str(comments_elements[0]))

            # Shares
            shares_elements = video_response.css("strong[data-e2e='share-count']::text")
            if shares_elements:
                details['shares'] = self._parse_count(str(shares_elements[0]))

            # Duration
            duration_elements = video_response.css("div[class*='DivSeekBarTimeContainer'] div::text")
            if duration_elements and len(duration_elements) >= 2:
                details['duration'] = str(duration_elements[1])

            return details
        except Exception as e:
            self.logger.error(f"Error fetching video details from {video_url}: {e}")
            return None

    def _parse_count(self, count_str: str) -> int:
        """Parse TikTok view/like counts (e.g., '1.2M' -> 1200000)."""
        if not count_str:
            return 0

        count_str = str(count_str).strip().upper()
        try:
            if 'K' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000)
            elif 'M' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000000)
            elif 'B' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000000000)
            else:
                # Remove any non-numeric characters
                return int(re.sub(r'[^\d]', '', count_str) or 0)
        except Exception:
            return 0
        # Fall-through for strings with a suffix but no digits (e.g., 'K')
        return 0
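
    # Illustrative _parse_count results, derived from the logic above
    # (not executed):
    #   _parse_count('1.2M') -> 1200000
    #   _parse_count('4.5K') -> 4500
    #   _parse_count('2.1B') -> 2100000000
    #   _parse_count('987')  -> 987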

    def fetch_content(self, max_posts: int = 20, fetch_captions: bool = False,
                      max_caption_fetches: int = 10) -> List[Dict[str, Any]]:
        """Fetch all content from TikTok, with optional caption retrieval.

        Args:
            max_posts: Maximum number of posts to fetch.
            fetch_captions: Whether to fetch captions from individual video pages.
            max_caption_fetches: Maximum number of videos to fetch captions for.
        """
        # First, get video IDs and basic info from the profile
        posts_data = self.fetch_posts(max_posts=max_posts, enable_scrolling=(max_posts > 20))

        # Optionally fetch captions from individual video pages
        if fetch_captions and posts_data:
            caption_limit = min(len(posts_data), max_caption_fetches)
            self.logger.info(f"Fetching captions for {caption_limit} videos (this will take time)...")

            successful_fetches = 0
            for i, post in enumerate(posts_data[:caption_limit]):
                try:
                    # Aggressive delay before each fetch to avoid detection
                    self._human_delay(5, 10)

                    # Fetch individual video details
                    video_url = post.get('link', '')
                    if not video_url:
                        continue

                    self.logger.info(f"Fetching caption {i+1}/{caption_limit}: {video_url}")
                    video_details = self._fetch_video_details(video_url)

                    if video_details:
                        # Update the post with the fetched details
                        post.update(video_details)
                        successful_fetches += 1
                        self.logger.info(f"Successfully fetched caption ({successful_fetches}/{caption_limit})")

                    # Extended break every 3 videos to avoid detection
                    if (i + 1) % 3 == 0 and i < caption_limit - 1:
                        break_time = random.uniform(30, 60)
                        self.logger.info(f"Taking extended {break_time:.0f}s break to avoid detection...")
                        time.sleep(break_time)
                except Exception as e:
                    self.logger.warning(f"Failed to fetch details for video {i+1}: {e}")
                    continue

            self.logger.info(f"Caption fetching complete: {successful_fetches}/{caption_limit} successful")

        return posts_data

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format TikTok content as markdown."""
        markdown_sections = []

        for item in items:
            section = []

            # ID
            section.append(f"# ID: {item.get('id', 'N/A')}")
            section.append("")

            # Type
            section.append(f"## Type: {item.get('type', 'video')}")
            section.append("")

            # Author
            section.append(f"## Author: @{item.get('author', 'Unknown')}")
            section.append("")

            # Publish Date
            section.append(f"## Publish Date: {item.get('publish_date', '')}")
            section.append("")

            # Link
            section.append(f"## Link: {item.get('link', '')}")
            section.append("")

            # Views
            views = item.get('views', 0)
            section.append(f"## Views: {views:,}")
            section.append("")

            # Likes (if fetched from the individual page)
            likes = item.get('likes')
            if likes is not None:
                section.append(f"## Likes: {likes:,}")
                section.append("")

            # Comments (if fetched from the individual page)
            comments = item.get('comments')
            if comments is not None:
                section.append(f"## Comments: {comments:,}")
                section.append("")

            # Shares (if fetched from the individual page)
            shares = item.get('shares')
            if shares is not None:
                section.append(f"## Shares: {shares:,}")
                section.append("")

            # Duration (if fetched from the individual page)
            duration = item.get('duration')
            if duration:
                section.append(f"## Duration: {duration}")
                section.append("")

            # Caption
            section.append("## Caption:")
            caption = item.get('caption', '')
            if caption:
                section.append(caption)
            else:
                section.append("(No caption available - fetch individual video for details)")
            section.append("")

            # Separator
            section.append("-" * 50)
            section.append("")

            markdown_sections.append('\n'.join(section))

        return '\n'.join(markdown_sections)

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return only the videos that are new since the last sync.

        Assumes `items` is ordered newest-first, matching the order in which
        the profile grid is scraped.
        """
        if not state:
            return items

        last_video_id = state.get('last_video_id')
        if not last_video_id:
            return items

        # Collect videos until we reach the last synced one
        new_items = []
        for item in items:
            if item.get('id') == last_video_id:
                break  # Found the last synced video
            new_items.append(item)

        return new_items

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Update state with the latest video information."""
        if not items:
            return state

        # The first item is the most recent (items are ordered newest-first)
        latest_item = items[0]
        state['last_video_id'] = latest_item.get('id')
        state['last_video_date'] = latest_item.get('publish_date')
        state['last_sync'] = datetime.now(self.tz).isoformat()
        state['video_count'] = len(items)

        return state
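

# Minimal usage sketch (illustrative only). The commit above confirms that
# ScraperConfig takes a brand_name parameter; any other required fields live
# in src/base_scraper.py and would need to be supplied here as well.
if __name__ == "__main__":
    config = ScraperConfig(brand_name='hkia')  # assumption: brand_name alone suffices
    scraper = TikTokScraperAdvanced(config)

    # Captions are skipped by default because each one costs a full stealth
    # page load plus 5-10 s of human-like delay.
    posts = scraper.fetch_content(max_posts=20, fetch_captions=False)
    print(scraper.format_markdown(posts))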