import os
import time
import random
from typing import Any, Dict, List, Optional
from datetime import datetime, timedelta
from pathlib import Path
import json
import re

from scrapling import StealthyFetcher, Adaptor

from src.base_scraper import BaseScraper, ScraperConfig


class TikTokScraperAdvanced(BaseScraper):
    """TikTok scraper using advanced Scrapling configuration for bot detection avoidance."""

    def __init__(self, config: ScraperConfig):
        super().__init__(config)
        self.target_username = os.getenv('TIKTOK_TARGET', 'hkia')
        self.base_url = f"https://www.tiktok.com/@{self.target_username}"

        # Configure global StealthyFetcher settings
        StealthyFetcher.auto_match = True  # Enable automatic element matching
        StealthyFetcher.huge_tree = True   # Allow large HTML trees

    def _enhanced_typing(self, element, text: str):
        """Realistic typing patterns (30-70 WPM, with typos)."""
        for char in text:
            # Variable typing speed
            base_delay = random.uniform(0.08, 0.25)

            # Pause on complex characters
            if char in '@._-':
                base_delay *= random.uniform(1.2, 2.0)

            # Occasional hesitation (10% chance)
            if random.random() < 0.1:
                time.sleep(random.uniform(0.3, 0.8))

            element.type(char)
            time.sleep(base_delay)

            # Typo correction (3% chance)
            if random.random() < 0.03:
                element.press('Backspace')
                time.sleep(random.uniform(0.1, 0.3))
                element.type(char)

    def _advanced_human_simulation(self, page):
        """Natural page-reading behavior."""
        try:
            viewport_height = page.viewport_size.get('height', 800)

            # Natural scrolling patterns
            for i in range(random.randint(3, 6)):
                scroll_amount = random.randint(100, viewport_height // 3)
                page.mouse.wheel(0, scroll_amount)
                time.sleep(random.uniform(0.8, 2.5))  # Reading time

                # Occasional back-scroll (re-reading)
                if random.random() < 0.3:
                    page.mouse.wheel(0, -random.randint(50, 150))

            # Random mouse movements
            for _ in range(random.randint(2, 4)):
                x = random.randint(100, page.viewport_size.get('width', 1200) - 100)
                y = random.randint(100, page.viewport_size.get('height', 800) - 100)
                page.mouse.move(x, y)
                time.sleep(random.uniform(0.3, 0.8))
        except Exception as e:
            self.logger.debug(f"Human simulation error (non-critical): {e}")

    def _human_delay(self, min_seconds: float = 2, max_seconds: float = 5) -> None:
        """Add human-like delays between actions."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds (human-like delay)...")
        time.sleep(delay)
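    # The two simulation helpers above expect a live Playwright-style page
    # object, which StealthyFetcher.fetch() does not hand back directly. A
    # minimal sketch of one possible wiring, assuming the installed Scrapling
    # version supports a `page_action` callback on fetch() (verify the hook's
    # name and signature against your release before relying on it):
    def _simulate_on_page(self, page):
        """Illustrative adapter: run the human simulation inside a fetch hook.

        Intended usage (hedged sketch, not wired up by default):
            StealthyFetcher.fetch(url, page_action=self._simulate_on_page)
        """
        self._advanced_human_simulation(page)
        return page  # page hooks are expected to return the page object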
    def fetch_posts(self, max_posts: int = 20, enable_scrolling: bool = True) -> List[Dict[str, Any]]:
        """Fetch posts from a TikTok profile using advanced stealth configuration.

        Args:
            max_posts: Maximum number of posts to fetch
            enable_scrolling: Whether to scroll the profile page to load more videos
        """
        posts_data = []

        try:
            self.logger.info(f"Fetching TikTok posts from @{self.target_username}")

            # Advanced stealth configuration for TikTok
            self.logger.info(f"Loading {self.base_url} with advanced stealth settings...")

            response = StealthyFetcher.fetch(
                url=self.base_url,
                # Display and stealth settings
                headless=False,           # Visible browser for manual CAPTCHA intervention
                # Network and resource management
                block_webrtc=True,        # Prevent WebRTC IP leaks
                allow_webgl=True,         # CRITICAL: anti-bot checks probe WebGL, so keep it enabled
                block_images=False,       # Keep images so CAPTCHAs stay visible
                disable_ads=True,         # Block ads for a cleaner page
                disable_resources=False,  # Keep all resources to avoid detection
                # Geographic and fingerprinting
                geoip=True,               # Automatic geolocation spoofing
                os_randomize=True,        # Randomize OS fingerprints
                google_search=True,       # Set Google as the referrer
                # Humanization and behavior
                humanize=True,            # Enable human-like mouse movements
                # Performance and timing
                network_idle=True,        # Wait for the network-idle state
                timeout=120000,           # 2-minute timeout (reduced for testing)
                wait=3000,                # 3-second wait after page load
                # Enhanced headers for better compatibility
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,en-CA;q=0.8",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "max-age=0",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1",
                    "Sec-Fetch-Dest": "document",
                    "Sec-Fetch-Mode": "navigate",
                    "Sec-Fetch-Site": "none",
                    "Sec-Fetch-User": "?1"
                }
            )

            if not response:
                self.logger.error("Failed to load TikTok profile")
                return posts_data

            self.logger.info("Page loaded successfully, performing human simulation...")

            # Perform advanced human simulation if we have access to the page object
            try:
                # Note: this would need to be adapted to Scrapling's API;
                # see _simulate_on_page above for one possible wiring.
                # self._advanced_human_simulation(page)
                pass
            except Exception as e:
                self.logger.debug(f"Human simulation not available: {e}")

            # Wait for a human-like delay
            self._human_delay(3, 6)

            # Optional: scroll to load more videos
            if enable_scrolling and max_posts > 20:
                self.logger.info(f"Scrolling to load more videos (targeting {max_posts} posts)...")

                # Simulate scrolling to trigger lazy loading
                for scroll_attempt in range(min(5, max_posts // 10)):
                    try:
                        # Scroll down progressively
                        self.logger.debug(f"Scroll attempt {scroll_attempt + 1}")
                        # Note: actual scrolling would also need Scrapling's
                        # page API; for now this only paces the attempts.
                        self._human_delay(2, 4)
                    except Exception as e:
                        self.logger.debug(f"Scrolling error (non-critical): {e}")
                        break
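            # What each scroll attempt could do with a live page object: a
            # hedged sketch assuming the Playwright-style API already used by
            # _advanced_human_simulation (left commented out because fetch()
            # returns a parsed response here, not the underlying page):
            #
            #     page.mouse.wheel(0, random.randint(400, 900))
            #     time.sleep(random.uniform(0.8, 2.0))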
            # Extract video items using multiple strategies

            # Strategy 1: primary TikTok selector
            video_items = response.css("[data-e2e='user-post-item']")
            self.logger.info(f"Strategy 1 found {len(video_items)} items with user-post-item selector")

            # Strategies 2-3: alternative container selectors
            if not video_items:
                video_items = response.css("div[class*='DivItemContainer']")
                self.logger.info(f"Strategy 2 found {len(video_items)} items with DivItemContainer selector")

            if not video_items:
                video_items = response.css("div[class*='video-feed-item']")
                self.logger.info(f"Strategy 3 found {len(video_items)} items with video-feed-item selector")

            # Strategy 4: look for video links directly
            if not video_items:
                video_links = response.css("a[href*='/video/']")
                self.logger.info(f"Strategy 4 found {len(video_links)} direct video links")

                for idx, link in enumerate(video_links[:max_posts]):
                    try:
                        # Extract the href from the matched link itself
                        # (re-querying the page by position is unreliable)
                        href = ""
                        href_elements = link.css("::attr(href)")
                        if href_elements:
                            href = str(href_elements[0])

                        if not href:
                            continue

                        if not href.startswith('http'):
                            href = f"https://www.tiktok.com{href}"

                        video_id_match = re.search(r'/video/(\d+)', href)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': '',
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': href,
                            'views': 0,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)
                    except Exception as e:
                        self.logger.error(f"Error processing video link {idx}: {e}")
                        continue

            # Process structured video items found by strategies 1-3
            if video_items and not posts_data:
                self.logger.info(f"Processing {len(video_items)} structured video items...")

                for idx, item in enumerate(video_items[:max_posts]):
                    try:
                        # Extract the video URL using the ::attr() selector
                        video_url = ""
                        href_elements = item.css("a[href*='/video/']::attr(href)")
                        if href_elements:
                            video_url = href_elements[0]

                        if not video_url:
                            # Try an alternative approach
                            link_elements = item.css("a")
                            for link_elem in link_elements:
                                href_attrs = link_elem.css("::attr(href)")
                                if href_attrs and '/video/' in str(href_attrs[0]):
                                    video_url = href_attrs[0]
                                    break

                        if not video_url:
                            continue

                        if not video_url.startswith('http'):
                            video_url = f"https://www.tiktok.com{video_url}"

                        # Extract the video ID from the URL
                        video_id_match = re.search(r'/video/(\d+)', video_url)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        # Extract the caption/description using the ::text selector
                        caption = ""
                        caption_elements = item.css("div[data-e2e='browse-video-desc'] span::text")
                        if caption_elements:
                            caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        if not caption:
                            caption_elements = item.css("div[class*='DivContainer'] span::text")
                            if caption_elements:
                                caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        # Extract the view count using the ::text selector
                        views_text = "0"
                        views_elements = item.css("strong[data-e2e='video-views']::text")
                        if views_elements:
                            views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        if not views_text or views_text == "0":
                            views_elements = item.css("strong::text")
                            if views_elements:
                                views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        views = self._parse_count(views_text)

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': caption,
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': video_url,
                            'views': views,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)

                        if idx % 5 == 0 and idx > 0:
                            self.logger.info(f"Processed {idx} videos...")
                    except Exception as e:
                        self.logger.error(f"Error processing video item {idx}: {e}")
                        continue

            # Strategy 5: extract from page scripts as a last resort
            if not posts_data:
                self.logger.info("No posts found via selectors, checking page scripts...")

                scripts = response.css("script")
                for script in scripts:
                    script_text_elements = script.css("::text")
                    if not script_text_elements:
                        continue
                    script_text = script_text_elements[0] if isinstance(script_text_elements, list) else str(script_text_elements)

                    if '__UNIVERSAL_DATA_FOR_REHYDRATION__' in script_text or 'window.__INIT_PROPS__' in script_text:
                        try:
                            # Look for video IDs in the script content
                            urls = re.findall(r'["\']*/video/(\d+)["\']', script_text)
                            unique_ids = list(dict.fromkeys(urls))  # De-duplicate, preserving order
                            self.logger.info(f"Found {len(unique_ids)} unique video IDs in script data")

                            for video_id in unique_ids[:max_posts]:
                                post_data = {
                                    'id': video_id,
                                    'type': 'video',
                                    'caption': '',
                                    'author': self.target_username,
                                    'publish_date': datetime.now(self.tz).isoformat(),
                                    'link': f"https://www.tiktok.com/@{self.target_username}/video/{video_id}",
                                    'views': 0,
                                    'platform': 'tiktok'
                                }
                                posts_data.append(post_data)
                        except Exception as e:
                            self.logger.debug(f"Could not parse script data: {e}")
                            continue

            self.logger.info(f"Successfully fetched {len(posts_data)} TikTok posts")
        except Exception as e:
            self.logger.error(f"Error fetching TikTok posts: {e}")
            import traceback
            self.logger.error(traceback.format_exc())

        return posts_data
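    # The selector-fallback chain in fetch_posts() could be factored into a
    # small helper; a sketch of that refactor (the name _first_match is
    # illustrative and not part of the original class):
    def _first_match(self, response, selectors: List[str]) -> List[Any]:
        """Return the matches for the first selector that yields any results."""
        for selector in selectors:
            matches = response.css(selector)
            if matches:
                return matches
        return []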
    def _fetch_video_details(self, video_url: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed information from an individual TikTok video page.

        Args:
            video_url: URL of the TikTok video

        Returns:
            Dictionary with caption and additional metadata, or None if failed
        """
        try:
            self.logger.debug(f"Fetching details for: {video_url}")

            # Fetch the individual video page with stealth settings
            video_response = StealthyFetcher.fetch(
                url=video_url,
                headless=False,
                block_webrtc=True,
                allow_webgl=True,
                block_images=False,
                disable_ads=True,
                geoip=True,
                os_randomize=True,
                google_search=True,
                humanize=True,
                network_idle=True,
                timeout=60000,  # 1-minute timeout for individual pages
                wait=2000,
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1"
                }
            )

            if not video_response:
                self.logger.warning(f"Failed to load video page: {video_url}")
                return None

            details = {}

            # Extract the caption/description from the video page
            caption_selectors = [
                "h1[data-e2e='browse-video-desc']",
                "div[data-e2e='browse-video-desc']",
                "span[data-e2e='browse-video-desc']",
                "div.video-meta-caption",
                "div[class*='DivVideoInfoContainer'] span",
                "h1.video-meta-title",
                "meta[property='og:description']::attr(content)"
            ]

            caption = ""
            for selector in caption_selectors:
                try:
                    # The meta selector already targets an attribute;
                    # only append ::text to plain element selectors.
                    query = selector if '::attr' in selector else f"{selector}::text"
                    caption_elements = video_response.css(query)
                    if caption_elements:
                        caption = ' '.join(str(elem).strip() for elem in caption_elements if elem)
                        if caption:
                            self.logger.debug(f"Found caption with selector: {selector}")
                            break
                except Exception:
                    continue

            details['caption'] = caption

            # Try to extract additional metadata
            # Likes
            likes_elements = video_response.css("strong[data-e2e='like-count']::text")
            if likes_elements:
                details['likes'] = self._parse_count(str(likes_elements[0]))

            # Comments
            comments_elements = video_response.css("strong[data-e2e='comment-count']::text")
            if comments_elements:
                details['comments'] = self._parse_count(str(comments_elements[0]))

            # Shares
            shares_elements = video_response.css("strong[data-e2e='share-count']::text")
            if shares_elements:
                details['shares'] = self._parse_count(str(shares_elements[0]))

            # Duration
            duration_elements = video_response.css("div[class*='DivSeekBarTimeContainer'] div::text")
            if duration_elements and len(duration_elements) >= 2:
                details['duration'] = str(duration_elements[1])

            return details
        except Exception as e:
            self.logger.error(f"Error fetching video details from {video_url}: {e}")
            return None

    def _parse_count(self, count_str: str) -> int:
        """Parse TikTok view/like counts (e.g., '1.2M' -> 1200000)."""
        if not count_str:
            return 0

        count_str = str(count_str).strip().upper()

        try:
            # Suffix multipliers: K = thousand, M = million, B = billion
            for suffix, multiplier in (('K', 1_000), ('M', 1_000_000), ('B', 1_000_000_000)):
                if suffix in count_str:
                    num = re.search(r'([\d.]+)', count_str)
                    return int(float(num.group(1)) * multiplier) if num else 0
            # No suffix: strip any non-numeric characters
            return int(re.sub(r'[^\d]', '', count_str) or 0)
        except Exception:
            return 0
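    # Sanity examples for _parse_count (doctest-style; `scraper` is a
    # hypothetical TikTokScraperAdvanced instance, and the expected values
    # follow directly from the suffix rules above):
    #
    #     >>> scraper._parse_count('1.2M')
    #     1200000
    #     >>> scraper._parse_count('15.3K')
    #     15300
    #     >>> scraper._parse_count('987')
    #     987
    #     >>> scraper._parse_count('')
    #     0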
    def fetch_content(self, max_posts: int = 20, fetch_captions: bool = False, max_caption_fetches: int = 10) -> List[Dict[str, Any]]:
        """Fetch all content from TikTok with optional caption retrieval.

        Args:
            max_posts: Maximum number of posts to fetch
            fetch_captions: Whether to fetch captions from individual video pages
            max_caption_fetches: Maximum number of videos to fetch captions for
        """
        # First, get video IDs and basic info from the profile
        posts_data = self.fetch_posts(max_posts=max_posts, enable_scrolling=(max_posts > 20))

        # Optionally fetch captions from individual video pages
        if fetch_captions and posts_data:
            caption_limit = min(len(posts_data), max_caption_fetches)
            self.logger.info(f"Fetching captions for {caption_limit} videos (this will take time)...")

            successful_fetches = 0
            for i, post in enumerate(posts_data[:caption_limit]):
                try:
                    # Aggressive delay before each fetch to avoid detection
                    self._human_delay(5, 10)

                    # Fetch individual video details
                    video_url = post.get('link', '')
                    if not video_url:
                        continue

                    self.logger.info(f"Fetching caption {i+1}/{caption_limit}: {video_url}")
                    video_details = self._fetch_video_details(video_url)

                    if video_details:
                        # Update the post with the fetched details
                        post.update(video_details)
                        successful_fetches += 1
                        self.logger.info(f"Successfully fetched caption ({successful_fetches}/{caption_limit})")

                    # Extended break every 3 videos to avoid detection
                    if (i + 1) % 3 == 0 and i < caption_limit - 1:
                        break_time = random.uniform(30, 60)
                        self.logger.info(f"Taking extended {break_time:.0f}s break to avoid detection...")
                        time.sleep(break_time)
                except Exception as e:
                    self.logger.warning(f"Failed to fetch details for video {i+1}: {e}")
                    continue

            self.logger.info(f"Caption fetching complete: {successful_fetches}/{caption_limit} successful")

        return posts_data
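    # format_markdown() below emits one section per item. For a single video
    # with no optional metadata, the output looks like this (values are
    # illustrative; the timezone comes from self.tz):
    #
    #     # ID: 7234567890123456789
    #
    #     ## Type: video
    #
    #     ## Author: @hkia
    #
    #     ## Publish Date: 2024-01-01T12:00:00+08:00
    #
    #     ## Link: https://www.tiktok.com/@hkia/video/7234567890123456789
    #
    #     ## Views: 1,200,000
    #
    #     ## Caption:
    #     (No caption available - fetch individual video for details)
    #
    #     --------------------------------------------------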
section.append("## Caption:") caption = item.get('caption', '') if caption: section.append(caption) else: section.append("(No caption available - fetch individual video for details)") section.append("") # Separator section.append("-" * 50) section.append("") markdown_sections.append('\n'.join(section)) return '\n'.join(markdown_sections) def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]: """Get only new videos since last sync.""" if not state: return items last_video_id = state.get('last_video_id') if not last_video_id: return items # Filter for videos newer than the last synced new_items = [] for item in items: if item.get('id') == last_video_id: break # Found the last synced video new_items.append(item) return new_items def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]: """Update state with latest video information.""" if not items: return state # Get the first item (most recent) latest_item = items[0] state['last_video_id'] = latest_item.get('id') state['last_video_date'] = latest_item.get('publish_date') state['last_sync'] = datetime.now(self.tz).isoformat() state['video_count'] = len(items) return state