import asyncio
import concurrent.futures
import datetime
import json
import os
import threading
import time
from typing import Any, Dict, List, Optional, Set, Tuple

import aiohttp
import pandas as pd
import requests
from datasets import Dataset, DatasetDict
from tqdm import tqdm
class Ratemiter:
    """Rate limiter for GitHub API requests that follows the rate-limit headers.

    Two mechanisms are combined:

    * pacing - at most ``calls_per_second`` requests per second;
    * budget - when the number of remaining calls reported by the API drops
      to the low-water mark, callers are held back until the reset time.

    ``wait`` serves synchronous callers and tracks one global budget;
    ``async_wait`` serves coroutines and tracks a budget per API resource
    (``"core"``, ``"search"``).
    """

    # Response headers GitHub uses to report rate-limit state.
    _HDR_LIMIT = "X-RateLimit-Limit"
    _HDR_REMAINING = "X-RateLimit-Remaining"
    _HDR_RESET = "X-RateLimit-Reset"
    _HDR_RESOURCE = "X-RateLimit-Resource"

    _LOW_WATER_MARK = 5  # stop issuing calls once this few remain
    _MAX_ASYNC_RESET_WAIT = 300  # cap (seconds) on a single async reset wait

    def __init__(self, calls_per_second=1):
        """Create a limiter pacing requests at ``calls_per_second``.

        A non-positive or missing rate falls back to one call per second.
        """
        # The original ignored the argument and always used 1.
        if calls_per_second and calls_per_second > 0:
            self.calls_per_second = calls_per_second
        else:
            self.calls_per_second = 1
        self.last_call = 0
        self.lock = threading.Lock()
        self.remaining_calls = 60  # GitHub's unauthenticated hourly quota
        self.reset_time = time.time() + 3600
        self.authenticated = False
        self.core_mit = {}  # last header snapshot for the core API
        self.search_mit = {}  # last header snapshot for the search API
        self.semaphore = asyncio.Semaphore(10)  # bounds concurrent async waiters
        self.resource_mits = {
            "core": {"remaining": 60, "mit": 60, "reset": time.time() + 3600},
            "search": {"remaining": 10, "mit": 10, "reset": time.time() + 3600},
        }

    def wait(self):
        """Block the calling thread until the next request may be sent."""
        with self.lock:
            now = time.time()
            quota = 5000 if self.authenticated else 60

            if now > self.reset_time:
                # Start a fresh window; otherwise the budget would be refilled
                # on every call once the first window has elapsed.
                self.remaining_calls = quota
                self.reset_time = now + 3600

            if self.remaining_calls <= self._LOW_WATER_MARK:
                wait_time = max(0, self.reset_time - now)
                if wait_time > 0:
                    print(
                        f"Rate mit reached. Waiting {wait_time:.1f} seconds for reset..."
                    )
                    time.sleep(wait_time + 2)  # small safety margin
                    self.remaining_calls = quota
                    now = time.time()  # the clock moved while we slept

            pacing_delay = max(0, 1 / self.calls_per_second - (now - self.last_call))
            if pacing_delay > 0:
                time.sleep(pacing_delay)

            self.last_call = time.time()
            self.remaining_calls -= 1

    async def async_wait(self, resource="core"):
        """Coroutine counterpart of ``wait`` with a per-``resource`` budget.

        The thread lock is only held for bookkeeping and never across an
        ``await``, so other coroutines keep running while this one sleeps.
        """
        async with self.semaphore:
            with self.lock:
                now = time.time()
                budget = self._budget_for(resource, now)
                if now > budget["reset"]:
                    budget["remaining"] = budget["mit"]
                reset_delay = 0
                if budget["remaining"] <= self._LOW_WATER_MARK:
                    reset_delay = max(0, budget["reset"] - now)

            if reset_delay > 0:
                print(
                    f"Rate mit reached for {resource}. Waiting {reset_delay:.1f} seconds for reset..."
                )
                # Never sleep longer than the cap in one go.
                await asyncio.sleep(min(reset_delay, self._MAX_ASYNC_RESET_WAIT) + 2)

            with self.lock:
                pacing_delay = max(
                    0, 1 / self.calls_per_second - (time.time() - self.last_call)
                )
            if pacing_delay > 0:
                await asyncio.sleep(pacing_delay)

            with self.lock:
                now = time.time()
                # Re-read: a header update may have replaced the dict meanwhile.
                budget = self._budget_for(resource, now)
                if now > budget["reset"]:
                    budget["remaining"] = budget["mit"]
                self.last_call = now
                budget["remaining"] -= 1

    def _budget_for(self, resource, now):
        """Return (creating if needed) the budget dict tracked for ``resource``."""
        return self.resource_mits.setdefault(
            resource, {"remaining": 60, "mit": 60, "reset": now + 3600}
        )

    def update_from_headers(self, headers):
        """Update rate-limit state from a ``requests`` response's headers."""
        self._apply_headers(headers, warn_when_low=True)

    def update_from_aittp_headers(self, headers):
        """Update rate-limit state from an aiohttp response's headers."""
        self._apply_headers(headers, warn_when_low=False)

    def _apply_headers(self, headers, warn_when_low):
        """Shared header parser behind both ``update_from_*`` methods."""
        with self.lock:
            quota = headers.get(self._HDR_LIMIT)
            # Anything above the anonymous quota of 60 implies a valid token.
            if quota is not None and int(quota) > 60:
                self.authenticated = True

            remaining = headers.get(self._HDR_REMAINING)
            if remaining is not None:
                self.remaining_calls = int(remaining)
                if warn_when_low and self.remaining_calls < 20:
                    print(
                        f"⚠️ Only {self.remaining_calls} API calls remaining before rate mit"
                    )

            reset = headers.get(self._HDR_RESET)
            if reset is not None:
                self.reset_time = int(reset)

            resource = headers.get(self._HDR_RESOURCE)
            if resource:
                snapshot = {
                    "mit": int(headers.get(self._HDR_LIMIT, 0)),
                    "remaining": int(headers.get(self._HDR_REMAINING, 0)),
                    "reset": int(headers.get(self._HDR_RESET, 0)),
                }
                self.resource_mits[resource] = snapshot
                if resource == "core":
                    self.core_mit = snapshot
                elif resource == "search":
                    self.search_mit = snapshot


def get_github_token():
    """Return a GitHub token from the environment or a token file.

    ``GITHUB_TOKEN`` is consulted first, then ``~/.github_token``,
    ``~/.github/token`` and ``~/.config/github/token``. A leading
    ``Bearer ``/``token `` prefix and surrounding quotes are removed.

    Returns:
        The cleaned token, or a falsy value (``None`` or ``""``) if none
        was found.
    """
    token = os.environ.get("GITHUB_TOKEN")

    if not token:
        token_paths = [
            os.path.expanduser("~/.github_token"),
            os.path.expanduser("~/.github/token"),
            os.path.expanduser("~/.config/github/token"),
        ]
        for token_file in token_paths:
            if not os.path.exists(token_file):
                continue
            try:
                with open(token_file, "r") as f:
                    token = f.read().strip()
            except (OSError, UnicodeError):
                # Unreadable candidate: fall through to the next location.
                continue
            if token:
                break

    if token:
        token = token.strip()
        if token.lower().startswith(("bearer ", "token ")):
            token = token.split(" ", 1)[1].strip()
        token = token.strip("\"'")
    return token
def get_github_topics(mit=30, github_token=None, rate_miter=None):
    """Fetch a list of popular Python-related GitHub topics.

    Several search queries are tried in turn against the topic search
    endpoint until ``mit`` distinct topic names have been collected. If the
    API yields nothing, a built-in fallback list is used.

    Parameters:
    - mit: Maximum number of topics to retrieve
    - github_token: GitHub personal access token; looked up with
      ``get_github_token()`` when omitted
    - rate_miter: Ratemiter instance to control request rate; when omitted,
      one is created only for unauthenticated use

    Returns:
    - Dataset with a ``topics`` column, one topic name per row
    """
    if not github_token:
        github_token = get_github_token()

    headers = {"Accept": "application/vnd.github.v3+json"}
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"
        print("Using GitHub authentication token")
    else:
        print(
            "WARNING: No GitHub token found. API rate mits will be severely restricted."
        )
        if rate_miter is None:
            rate_miter = Ratemiter(calls_per_second=0.5)

    base_url = "https://api.github.com/search/topics"
    per_page = min(100, mit)  # GitHub API allows max 100 per page
    max_rate_limit_waits = 3  # per query; avoids waiting on 403s forever
    all_topics = []
    seen = set()  # lower-cased names already collected

    print(f"Fetching up to {mit} Python-related GitHub topics...")

    # Later queries are only used while earlier ones have not produced enough.
    search_queries = [
        "python",
        "topic:python",
        "language:python",
        "python framework",
        "python library",
    ]

    for query in search_queries:
        if len(all_topics) >= mit:
            break

        print(f"Trying search query: {query}")
        page = 1
        rate_limit_waits = 0

        while len(all_topics) < mit:
            # The page size stays constant within a query so page offsets
            # remain aligned.
            params = {"q": query, "per_page": per_page, "page": page}

            if rate_miter:
                rate_miter.wait()

            try:
                response = requests.get(
                    base_url, headers=headers, params=params, timeout=30
                )
            except requests.RequestException as exc:
                print(f"Error fetching topics: {exc}")
                break

            print(f"Response status: {response.status_code}")

            if rate_miter and response.headers:
                rate_miter.update_from_headers(response.headers)

            if response.status_code == 403:
                reset_time = int(
                    response.headers.get("X-RateLimit-Reset", time.time() + 3600)
                )
                wait_time = max(0, reset_time - time.time())
                print(f"Rate mit exceeded. Reset in {wait_time:.1f} seconds.")
                if 0 < wait_time < 3600 and rate_limit_waits < max_rate_limit_waits:
                    rate_limit_waits += 1
                    print("Waiting for rate mit reset...")
                    time.sleep(wait_time + 2)  # Add buffer
                    continue
                print("Rate mit wait time too long. Please use a GitHub token.")
                break

            if response.status_code != 200:
                print(f"Error fetching topics: Status {response.status_code}")
                print(f"Response: {response.text}")
                break

            items = response.json().get("items", [])
            print(f"Found {len(items)} items in response")
            if not items:
                break

            for item in items:
                name = item.get("name", "")
                key = name.lower()
                if key and key not in seen:
                    seen.add(key)
                    all_topics.append(name)
                    print(f"Adding topic: {name}")
                    if len(all_topics) >= mit:
                        break

            if len(items) < per_page:  # short page: nothing further to fetch
                break

            page += 1
            time.sleep(1)  # Be nice to the API

    if not all_topics:
        print("No Python topics found via API. Using fallback st.")
        all_topics = [
            "python",
            "django",
            "flask",
            "fastapi",
            "pandas",
            "numpy",
            "pytorch",
            "tensorflow",
            "scikit-learn",
            "matplotlib",
            "data-science",
            "machine-learning",
            "web-development",
            "api",
            "automation",
            "scraping",
            "nlp",
            "deep-learning",
        ]

    all_topics = all_topics[:mit]

    # Each topic becomes one row of the dataset.
    dataset = Dataset.from_dict({"topics": all_topics})
    print(f"Retrieved {len(all_topics)} Python-related topics")
    return dataset
def _parse_next_link(link_header):
    """Return the URL tagged ``rel="next"`` in an HTTP Link header, or None."""
    for part in link_header.split(","):
        if 'rel="next"' in part:
            target = part.split(";")[0].strip()
            return target[1:-1]  # drop the surrounding < and >
    return None


async def get_repositories_async(
    search_query, max_repos, headers, rate_miter, max_retries=3
):
    """Search GitHub repositories matching ``search_query``, most-starred first.

    Parameters:
    - search_query: GitHub repository search expression
    - max_repos: Maximum number of repositories to return
    - headers: HTTP headers sent with every request
    - rate_miter: Ratemiter consulted before each request and updated from
      each response
    - max_retries: Number of attempts before giving up

    Returns:
    - List of repository objects as returned by the API (at most
      ``max_repos``); empty on authentication, not-found or query errors
    """
    params = {
        "q": search_query,
        "sort": "stars",
        "order": "desc",
        "per_page": min(100, max_repos),  # GitHub API max is 100 per page
    }
    url = "https://api.github.com/search/repositories"
    all_repos = []

    async with aiohttp.ClientSession() as session:
        for retry in range(max_retries):
            try:
                # Every attempt counts against the search budget.
                await rate_miter.async_wait("search")
                async with session.get(
                    url, headers=headers, params=params, timeout=30
                ) as response:
                    rate_miter.update_from_aittp_headers(response.headers)

                    if response.status == 403:
                        remaining = response.headers.get("X-RateLimit-Remaining")
                        if remaining and int(remaining) == 0:
                            # Primary rate limit: the reset time is known.
                            reset_time = int(
                                response.headers.get(
                                    "X-RateLimit-Reset", time.time() + 3600
                                )
                            )
                            wait_time = max(0, reset_time - time.time())
                            print(
                                f"Rate mit exceeded when searching repos. Reset in {wait_time:.1f} seconds."
                            )
                            if 0 < wait_time < 3600:
                                print("Waiting for rate mit reset...")
                                await asyncio.sleep(wait_time + 2)  # Add buffer
                                continue
                            print(
                                "Rate mit wait time too long. Please use a GitHub token."
                            )
                            return []

                        # Otherwise: secondary rate limit / abuse detection.
                        retry_after = int(response.headers.get("Retry-After", 60))
                        print(
                            f"GitHub API temporary restriction. Waiting {retry_after} seconds before retry."
                        )
                        await asyncio.sleep(retry_after)
                        continue

                    if response.status == 401:
                        print("Authentication failed. Check your GitHub token.")
                        return []

                    if response.status == 404:
                        print(f"Resource not found: {url}")
                        return []

                    if response.status == 422:
                        text = await response.text()
                        print(f"Query error: {text}")
                        # Retry once per level without the stars qualifier.
                        if "stars:>" in search_query:
                            simpfied_query = search_query.split("stars:>")[0].strip()
                            print(f"Trying with simpfied query: {simpfied_query}")
                            return await get_repositories_async(
                                simpfied_query,
                                max_repos,
                                headers,
                                rate_miter,
                                max_retries - 1,
                            )
                        return []

                    if response.status != 200:
                        text = await response.text()
                        print(f"Error searching repositories: Status {response.status}")
                        print(f"Response: {text}")
                        await asyncio.sleep(10 * (retry + 1))  # linear back-off
                        continue

                    result = await response.json()
                    repos = result.get("items", [])
                    all_repos.extend(repos)

                    # A full page plus unmet demand means more results may exist.
                    if len(all_repos) < max_repos and len(repos) == 100:
                        # NOTE(review): the Link header exposes a single "next"
                        # URL, so at most one extra page is fetched here.
                        next_url = _parse_next_link(response.headers.get("Link", ""))
                        next_urls = [next_url] if next_url else []
                        if next_urls:
                            print(
                                f"Fetched {len(all_repos)} repos, getting more to reach {max_repos}..."
                            )
                            tasks = [
                                fetch_next_page(session, page_url, headers, rate_miter)
                                for page_url in next_urls[: min(5, max_repos // 100)]
                            ]
                            if tasks:
                                for page_repos in await asyncio.gather(*tasks):
                                    all_repos.extend(page_repos)
                                    if len(all_repos) >= max_repos:
                                        break

                    return all_repos[:max_repos]

            except asyncio.TimeoutError:
                print(
                    f"Request timed out (attempt {retry+1}/{max_retries}). Retrying..."
                )
                await asyncio.sleep(5)
            except Exception as e:
                # Best-effort collection: report and retry on any failure.
                print(
                    f"Exception when searching repositories (attempt {retry+1}/{max_retries}): {e}"
                )
                await asyncio.sleep(5)

        print("Failed to fetch repositories after multiple attempts")
        return all_repos[:max_repos] if all_repos else []
async def fetch_next_page(session, url, headers, rate_miter):
    """Fetch a single follow-up page of repository search results.

    Parameters:
    - session: open aiohttp client session to issue the request on
    - url: absolute URL of the page (query string included)
    - headers: HTTP headers sent with the request
    - rate_miter: Ratemiter consulted before and updated after the request

    Returns:
    - The page's ``items`` list, or an empty list on any failure
    """
    await rate_miter.async_wait("search")
    try:
        async with session.get(url, headers=headers, timeout=30) as response:
            rate_miter.update_from_aittp_headers(response.headers)
            if response.status != 200:
                print(f"Error fetching next page: Status {response.status}")
                return []
            payload = await response.json()
            return payload.get("items", [])
    except Exception as e:
        # A failed extra page must not discard the pages already fetched.
        print(f"Error fetching next page: {e}")
        return []


def get_repositories(search_query, max_repos, headers, rate_miter, max_retries=3):
    """Synchronous wrapper for get_repositories_async.

    Runs the coroutine on a fresh event loop and returns its result.
    ``asyncio.run`` also tears the loop down, so no closed loop is left
    installed as the thread's current event loop.
    """
    return asyncio.run(
        get_repositories_async(
            search_query, max_repos, headers, rate_miter, max_retries
        )
    )
async def get_ises_page_async(
    repo_info, page, per_page, headers, rate_miter, max_retries=3
):
    """Fetch one page of closed issues for a repository.

    Issues are requested sorted by comment count, descending.

    Parameters:
    - repo_info: ``(owner, repo_name)`` tuple
    - page: 1-based page number
    - per_page: page size
    - headers: HTTP headers sent with the request
    - rate_miter: Ratemiter consulted before and updated after each attempt
    - max_retries: Number of attempts before giving up

    Returns:
    - ``(owner, repo_name, issues)``; ``issues`` is the decoded JSON body on
      success and an empty list on failure
    """
    owner, repo_name = repo_info

    # Only closed issues are requested; the caller looks for solved ones.
    params = {
        "state": "closed",
        "per_page": per_page,
        "page": page,
        "sort": "comments",
        "direction": "desc",
    }
    url = f"https://api.github.com/repos/{owner}/{repo_name}/issues"

    async with aiohttp.ClientSession() as session:
        for retry in range(max_retries):
            try:
                await rate_miter.async_wait("core")
                async with session.get(
                    url, headers=headers, params=params, timeout=30
                ) as response:
                    rate_miter.update_from_aittp_headers(response.headers)

                    if response.status == 403:
                        remaining = response.headers.get("X-RateLimit-Remaining")
                        if remaining and int(remaining) == 0:
                            # Primary rate limit: the reset time is known.
                            reset_time = int(
                                response.headers.get(
                                    "X-RateLimit-Reset", time.time() + 3600
                                )
                            )
                            wait_time = max(0, reset_time - time.time())
                            print(
                                f"Rate mit exceeded for {owner}/{repo_name}. Reset in {wait_time:.1f} seconds."
                            )
                            if 0 < wait_time < 3600:
                                print("Waiting for rate mit reset...")
                                await asyncio.sleep(wait_time + 2)  # Add buffer
                                continue
                            print(
                                "Rate mit wait time too long. Please use a GitHub token."
                            )
                            return owner, repo_name, []

                        # Otherwise: secondary rate limit / abuse detection.
                        retry_after = int(response.headers.get("Retry-After", 60))
                        print(
                            f"GitHub API temporary restriction for {owner}/{repo_name}. Waiting {retry_after} seconds."
                        )
                        await asyncio.sleep(retry_after)
                        continue

                    if response.status == 401:
                        print(
                            f"Authentication failed for {owner}/{repo_name}. Check your GitHub token."
                        )
                        return owner, repo_name, []

                    if response.status == 404:
                        print(f"Repository not found or no access: {owner}/{repo_name}")
                        return owner, repo_name, []

                    if response.status != 200:
                        text = await response.text()
                        print(
                            f"Error fetching ises for {owner}/{repo_name} (page {page}): Status {response.status}"
                        )
                        print(f"Response: {text}")
                        await asyncio.sleep(5 * (retry + 1))  # linear back-off
                        continue

                    return owner, repo_name, await response.json()

            except asyncio.TimeoutError:
                print(
                    f"Request timed out for {owner}/{repo_name} (attempt {retry+1}/{max_retries}). Retrying..."
                )
                await asyncio.sleep(5)
            except Exception as e:
                # Best-effort collection: report and retry on any failure.
                print(
                    f"Exception when fetching ises for {owner}/{repo_name} (page {page}) (attempt {retry+1}/{max_retries}): {e}"
                )
                await asyncio.sleep(5)

        print(f"Failed to fetch ises for {owner}/{repo_name} after multiple attempts")
        return owner, repo_name, []


def get_ises_page(repo_info, page, per_page, headers, rate_miter, max_retries=3):
    """Synchronous wrapper for get_ises_page_async.

    Runs the coroutine on a fresh event loop via ``asyncio.run`` so that no
    closed loop is left installed as the thread's current event loop.
    """
    return asyncio.run(
        get_ises_page_async(
            repo_info, page, per_page, headers, rate_miter, max_retries
        )
    )
async def get_ise_comments_async(
    repo_info, ise_number, headers, rate_miter, max_retries=3
):
    """Fetch the comments of one issue.

    Each returned comment that carries a ``reactions`` field also gets a
    ``reactions_data`` key holding the same value.

    Parameters:
    - repo_info: ``(owner, repo_name)`` tuple
    - ise_number: issue number within the repository
    - headers: HTTP headers sent with the request
    - rate_miter: Ratemiter consulted before and updated after each attempt
    - max_retries: Number of attempts on timeouts / connection errors

    Returns:
    - List of comment objects; empty on any error
    """
    owner, repo_name = repo_info
    url = f"https://api.github.com/repos/{owner}/{repo_name}/issues/{ise_number}/comments"
    print(f"Fetching comments for ise #{ise_number} from {owner}/{repo_name}")

    async with aiohttp.ClientSession() as session:
        for attempt in range(max(1, max_retries)):
            await rate_miter.async_wait("core")
            try:
                async with session.get(url, headers=headers, timeout=30) as response:
                    rate_miter.update_from_aittp_headers(response.headers)
                    print(f"Comments response status: {response.status}")

                    if response.status == 403:
                        remaining = response.headers.get("X-RateLimit-Remaining")
                        if remaining and int(remaining) == 0:
                            reset_time = int(
                                response.headers.get(
                                    "X-RateLimit-Reset", time.time() + 3600
                                )
                            )
                            wait_time = max(0, reset_time - time.time())
                            print(
                                f"Rate mit exceeded when fetching comments. Reset in {wait_time:.1f} seconds."
                            )
                        # Any 403 ends here; its body is an error payload,
                        # not a comment list.
                        return []

                    if response.status != 200:
                        print(
                            f"Error fetching comments for ise #{ise_number}: Status {response.status}"
                        )
                        response_text = await response.text()
                        print(f"Response: {response_text}")
                        return []

                    comments = await response.json()
            except (asyncio.TimeoutError, aiohttp.ClientError) as exc:
                # Transient network trouble: retry instead of failing the
                # whole repository.
                print(
                    f"Request for comments on ise #{ise_number} failed (attempt {attempt + 1}/{max_retries}): {exc}"
                )
                await asyncio.sleep(5)
                continue

            if not isinstance(comments, list):
                return []

            print(f"Fetched {len(comments)} comments for ise #{ise_number}")
            if comments:
                print(f"First comment keys: {list(comments[0].keys())}")

            for index, comment in enumerate(comments):
                if "reactions" not in comment:
                    print(f"Warning: Comment {index} has no reactions field")
                    continue
                comment["reactions_data"] = comment["reactions"]

            return comments

    return []
def find_best_solution(comments):
    """Pick the most liked comment with a usable body as the solution.

    Comments are ranked by their ``+1`` reaction count, with ``heart`` as the
    tie-breaker (reactions are read from ``reactions_data``, falling back to
    ``reactions``). The highest-ranked comment whose stripped body has at
    least 10 characters wins; if no comment is that long, the top-ranked one
    is returned regardless.

    Parameters:
    - comments: list of comment dicts (may be empty or None)

    Returns:
    - The chosen comment, or None when there are no comments. If the
      comments cannot be ranked (e.g. entries are not dicts), the first
      entry is returned as-is.
    """
    print(f"Finding best solution from {len(comments) if comments else 0} comments")

    if not comments:
        print("No comments available")
        return None

    min_body_length = 10

    def _score(comment):
        reactions = comment.get("reactions_data") or comment.get("reactions") or {}
        return (reactions.get("+1", 0), reactions.get("heart", 0))

    def _body(comment):
        # The API may deliver a null body.
        return (comment.get("body") or "").strip()

    try:
        # sorted() is stable, so equally rated comments keep thread order.
        ranked = sorted(comments, key=_score, reverse=True)
        print(f"Sorted {len(ranked)} comments by reactions")

        for candidate in ranked:
            if len(_body(candidate)) >= min_body_length:
                return candidate

        print("Best comment is too short, looking for alternatives")
        return ranked[0]
    except (AttributeError, TypeError) as e:
        print(f"Error sorting comments: {e}")
        print(f"Comment types: {[type(c) for c in comments[:5]]}")
        return comments[0]
async def process_ises_with_solutions(
    owner, repo_name, ises, topic_name, headers, rate_miter, max_ises=None
):
    """Turn raw issue objects into flat records that include a solution.

    Pull requests and non-closed issues are skipped. For every remaining
    issue the comments are fetched and the best one (see
    ``find_best_solution``) is attached; issues whose solution body is
    shorter than 20 characters are dropped.

    Parameters:
    - owner, repo_name: repository coordinates
    - ises: issue objects as returned by the GitHub API
    - topic_name: topic the repository was found under
    - headers: HTTP headers for the comment requests
    - rate_miter: Ratemiter passed on to the comment requests
    - max_ises: stop after this many records (None = no cap)

    Returns:
    - List of record dicts
    """
    processed_ises = []
    full_repo_name = f"{owner}/{repo_name}"

    for ise in ises:
        # The issues endpoint also returns pull requests; they carry this key.
        if "pull_request" in ise:
            continue

        if ise.get("state") != "closed":
            continue

        comments = await get_ise_comments_async(
            (owner, repo_name), ise["number"], headers, rate_miter
        )

        best_solution = find_best_solution(comments)
        if not best_solution:
            print(f"No best solution found for ise #{ise['number']}")
            continue

        ise_data = {
            "repo_name": full_repo_name,
            "topic": topic_name,
            "ise_number": ise["number"],
            "title": ise["title"],
            "body": ise.get("body") or "",
            "state": ise["state"],
            "created_at": ise["created_at"],
            "updated_at": ise["updated_at"],
            "url": ise["html_url"],
            # Same shape as process_ises; the original stored the solution
            # text here by mistake.
            "labels": [label["name"] for label in ise.get("labels") or []],
        }

        if ise.get("user"):
            ise_data["user_login"] = ise["user"]["login"]

        if "comments" in ise:
            ise_data["comments_count"] = ise["comments"]

        # Deleted accounts make "user" null, hence the ``or {}`` guards.
        solution_body = best_solution.get("body") or ""
        ise_data["solution_body"] = solution_body
        ise_data["solution_author"] = (best_solution.get("user") or {}).get("login", "")
        ise_data["solution_created_at"] = best_solution.get("created_at", "")
        ise_data["solution_url"] = best_solution.get("html_url", "")
        ise_data["solution_reactions"] = (best_solution.get("reactions") or {}).get(
            "total_count", 0
        )

        if len(solution_body.strip()) >= 20:
            processed_ises.append(ise_data)
        else:
            print(
                f"Solution for ise #{ise['number']} is too short: '{ise_data['solution_body']}'"
            )

        if max_ises is not None and len(processed_ises) >= max_ises:
            break

    return processed_ises
# Kept for backward compatibility with callers that do not need solutions.
def process_ises(owner, repo_name, ises, topic_name, max_ises=None):
    """Turn raw issue objects into flat records (without solutions).

    Pull requests are skipped.

    Parameters:
    - owner, repo_name: repository coordinates
    - ises: issue objects as returned by the GitHub API
    - topic_name: topic the repository was found under
    - max_ises: stop after this many records (None = no cap)

    Returns:
    - List of record dicts; ``user_login`` and ``comments_count`` are only
      present when the issue provides them
    """
    processed_ises = []
    full_repo_name = f"{owner}/{repo_name}"

    for ise in ises:
        # The issues endpoint also returns pull requests; they carry this key.
        if "pull_request" in ise:
            continue

        ise_data = {
            "repo_name": full_repo_name,
            "topic": topic_name,
            "ise_number": ise["number"],
            "title": ise["title"],
            "body": ise.get("body") or "",  # missing or null body -> ""
            "state": ise["state"],
            "created_at": ise["created_at"],
            "updated_at": ise["updated_at"],
            "url": ise["html_url"],
            "labels": [label["name"] for label in ise.get("labels") or []],
        }

        if ise.get("user"):
            ise_data["user_login"] = ise["user"]["login"]

        if "comments" in ise:
            ise_data["comments_count"] = ise["comments"]

        processed_ises.append(ise_data)

        if max_ises is not None and len(processed_ises) >= max_ises:
            break

    return processed_ises
async def collect_github_ises_by_topic_async(
    topics=None,
    num_topics=5,
    repos_per_topic=3,
    max_ises_per_repo=50,
    github_token=None,
    max_workers=5,
    min_stars=100,
    output_file=None,
    only_with_solutions=True,  # Default to only collecting issues with solutions
):
    """Collect GitHub issues across multiple topics.

    Parameters:
    - topics: list of topic names to search (if None, popular topics are fetched)
    - num_topics: Number of topics to include if topics=None
    - repos_per_topic: Number of repositories to collect per topic
    - max_ises_per_repo: Maximum number of issues to collect per repository
    - github_token: GitHub personal access token (recommended to avoid rate limits)
    - max_workers: Maximum number of parallel workers for API requests
    - min_stars: Minimum number of stars for repositories to include
    - output_file: optional path; the raw records are written there as JSON,
      and an ``*_intermediate.json`` sibling is refreshed after every batch
    - only_with_solutions: accepted for interface compatibility; not read here

    Returns:
    - The full ``Dataset`` of collected records on success, or an empty
      ``DatasetDict`` when nothing was collected or an error occurred.
      NOTE(review): a train/test split is computed for the summary only and
      is not what gets returned - confirm that this is what callers expect.
    """
    try:
        if not github_token:
            github_token = get_github_token()

        headers = {"Accept": "application/vnd.github.v3+json"}
        if github_token:
            headers["Authorization"] = f"Bearer {github_token}"
            print("Using GitHub authentication token")
        else:
            print(
                "WARNING: No GitHub token found. API rate mits will be severely restricted."
            )
            print(
                "Create a token at https://github.com/settings/tokens and set it as GITHUB_TOKEN env variable"
            )
            print("or save it to ~/.github_token file")
            # Anonymous quota is small: scale the crawl down.
            max_workers = min(2, max_workers)
            repos_per_topic = min(5, repos_per_topic)
            print(
                f"Reducing workers to {max_workers} and repos_per_topic to {repos_per_topic} due to no auth token"
            )

        calls_per_second = 1.0 if github_token else 0.5
        rate_miter = Ratemiter(calls_per_second=calls_per_second)

        if topics is None:
            # get_github_topics returns a Dataset with a "topics" column.
            topics_dataset = get_github_topics(mit=num_topics)
            topics = list(topics_dataset["topics"])
            if not topics:
                print("Could not fetch topics. Exiting.")
                return DatasetDict()

        print(f"Collecting ises for {len(topics)} topics: {', '.join(topics)}")

        all_ises_data = []

        # Topics are processed two at a time to avoid overwhelming the API.
        topic_batch_size = 2
        for start in range(0, len(topics), topic_batch_size):
            topic_batch = topics[start : start + topic_batch_size]
            topic_tasks = [
                process_topic(
                    topic,
                    repos_per_topic,
                    max_ises_per_repo,
                    headers,
                    rate_miter,
                    max_workers,
                    min_stars,
                )
                for topic in topic_batch
            ]

            # One failing topic must not cancel its batch mates.
            outcomes = await asyncio.gather(*topic_tasks, return_exceptions=True)
            for outcome in outcomes:
                if isinstance(outcome, Exception):
                    print(f"Error processing topic batch: {outcome}")
                    continue
                all_ises_data.extend(outcome)

            # Checkpoint so an interruption does not lose everything.
            if output_file and all_ises_data:
                intermediate_file = (
                    f"{os.path.splitext(output_file)[0]}_intermediate.json"
                )
                try:
                    with open(intermediate_file, "w") as f:
                        json.dump(all_ises_data, f)
                    print(
                        f"Saved {len(all_ises_data)} ises to intermediate file: {intermediate_file}"
                    )
                except Exception as e:
                    print(f"Error saving intermediate relts: {e}")

        if not all_ises_data:
            print("No ises collected across any topics.")
            return DatasetDict()

        if output_file:
            print(f"Saving raw data to {output_file}")
            with open(output_file, "w") as f:
                json.dump(all_ises_data, f, indent=2)

        print(
            f"\nCreating dataset with {len(all_ises_data)} ises across {len(topics)} topics..."
        )
        dataset = Dataset.from_pandas(pd.DataFrame(all_ises_data))

        topic_st = ", ".join(topics)
        dataset.info.description = (
            f"GitHub ises collected from repositories with topics: {topic_st}"
        )
        dataset.info.homepage = "https://github.com"
        dataset.info.license = "See individual repositories for cense information"
        dataset.info.version = datetime.datetime.now().strftime("%Y.%m.%d")

        # Summary statistics.
        topic_counts = {}
        repo_counts = {}
        for record in all_ises_data:
            topic_counts[record["topic"]] = topic_counts.get(record["topic"], 0) + 1
            repo_counts[record["repo_name"]] = (
                repo_counts.get(record["repo_name"], 0) + 1
            )

        print("\nIses per topic:")
        for topic, count in topic_counts.items():
            print(f" {topic}: {count} ises")

        print("\nIses per repository:")
        top_repos = sorted(repo_counts.items(), key=lambda item: item[1], reverse=True)
        for repo, count in top_repos[:10]:
            print(f" {repo}: {count} ises")

        # The split only feeds the size report below; a dataset too small to
        # split must not discard the collected data.
        try:
            dataset_dict = dataset.train_test_split(test_size=0.2, seed=42)
        except ValueError as e:
            print(f"Could not split dataset: {e}")
        else:
            print(f"\nTrain spt: {len(dataset_dict['train'])} ises")
            print(f"Test spt: {len(dataset_dict['test'])} ises")

        return dataset

    except Exception as e:
        # Top-level boundary of the crawl: report and return an empty result.
        print(f"Error in collect_github_ises_by_topic_async: {e}")
        return DatasetDict()
 min_stars, ):  """Process a single topic to collect ises from its repositories."""  print(f"\n--- Processing topic: {topic} ---")  try:  # Search for repositories with this topic and language:python  search_query = f"topic:{topic} language:python stars:>{min_stars}"  print(f"Searching for repositories with query: {search_query}")  repos = await get_repositories_async(  search_query, repos_per_topic, headers, rate_miter  )  if not repos:  print(f"No repositories found for topic '{topic}'. Skipping.")  return []  repo_count = len(repos)  print(f"Found {repo_count} repositories for topic '{topic}'")  # Prepare repository data for parallel processing  repo_data = [(repo["owner"]["login"], repo["name"]) for repo in repos]  # Process repositories in parallel - but mit to a reasonable number  # to avoid overwhelming the API  max_concurrent = min(max_workers, 10) # Cap at 10 concurrent repos max  repo_batch_size = 20 # Process repos in batches of 20  all_topic_ises = []  # Process repositories in batches to avoid creating too many tasks at once  for i in range(0, len(repo_data), repo_batch_size):  batch = repo_data[i : i + repo_batch_size]  # Process repositories in parallel  repo_tasks = []  for repo_info in batch:  task = process_repository(  repo_info, topic, max_ises_per_repo, headers, rate_miter  )  repo_tasks.append(task)  # Run repository tasks concurrently with a semaphore to mit concurrency  semaphore = asyncio.Semaphore(max_concurrent)  async def process_with_semaphore(task):  async with semaphore:  return await task  bounded_tasks = [process_with_semaphore(task) for task in repo_tasks]  # Gather relts with exception handng  repo_relts = await asyncio.gather(*bounded_tasks, return_exceptions=True)  # Flatten relts  for relt in repo_relts:  # Skip exceptions  if isinstance(relt, Exception):  print(f"Error processing repository in topic {topic}: {relt}")  continue  all_topic_ises.extend(relt)  # If we've collected enough ises, we can stop  # Only check if 
max_ises_per_repo is specified  if (  max_ises_per_repo is not None  and len(all_topic_ises) >= repos_per_topic * max_ises_per_repo / 2  ):  break  return all_topic_ises  except Exception as e:  print(f"Error processing topic {topic}: {e}")  return [] async def process_repository(  repo_info, topic, max_ises_per_repo, headers, rate_miter ):  """Process a single repository to collect its ises."""  owner, repo_name = repo_info  # First, get page 1 - only closed ises sorted by most comments  _, _, ises = await get_ises_page_async(repo_info, 1, 100, headers, rate_miter)  if not ises:  return []  # Check if ises is a st (it should be)  if not isinstance(ises, st):  print(f"Error: Expected ises to be a st, but got {type(ises)}")  return []  # Calculate how many pages we need  ises_on_first_page = len(  [i for i in ises if "pull_request" not in i and i["state"] == "closed"]  )  if ises_on_first_page == 0:  return []  ises_on_first_page = ises[: min(max_ises_per_repo, ises_on_first_page)]  # Process first page ises with solutions  processed_ises = await process_ises_with_solutions(  owner, repo_name, ises_on_first_page, topic, headers, rate_miter  )  # Check if there are more pages (GitHub returns 100 items per page max)  page = 2  max_pages = 10 # Fetch up to 10 pages (1000 ises) per repository  # If we don't have enough ises with solutions yet and there are kely more pages  while (  len(processed_ises) < max_ises_per_repo  and len(ises) == 100  and page <= max_pages  ):  # Fetch the next page  await rate_miter.async_wait("core")  _, _, page_ises = await get_ises_page_async(  repo_info, page, 100, headers, rate_miter  )  if page_ises:  # Process all ises from this page with solutions  new_ises = await process_ises_with_solutions(  owner, repo_name, page_ises, topic, headers, rate_miter  )  processed_ises.extend(new_ises)  print(  f"Collected {len(new_ises)} additional ises from {owner}/{repo_name} (page {page})"  )  # If this page wasn't full, we've reached the end  if 
len(page_ises) < 100:  break  else:  # No more ises  break  # Move to next page  page += 1  # If we have enough ises, stop fetching more pages  if len(processed_ises) >= max_ises_per_repo:  break  # All ises should have solutions due to our filtering in process_ises_with_solutions  print(  f"Collected total of {len(processed_ises)} ises with solutions from {owner}/{repo_name}"  )  # mit to max_ises_per_repo  return processed_ises[:max_ises_per_repo] def collect_github_ises_by_topic(  topics=None,  num_topics=5,  repos_per_topic=3,  max_ises_per_repo=50,  github_token=None,  max_workers=5,  min_stars=100,  output_file=None,  only_with_solutions=True, # Default to only collecting ises with solutions ) -> Dataset:  """Synchronous wrapper for collect_github_ises_by_topic_async."""  loop = asyncio.new_event_loop()  asyncio.set_event_loop(loop)  try:  return loop.run_until_complete(  collect_github_ises_by_topic_async(  topics=topics,  num_topics=num_topics,  repos_per_topic=repos_per_topic,  max_ises_per_repo=max_ises_per_repo,  github_token=github_token,  max_workers=max_workers,  min_stars=min_stars,  output_file=output_file,  )  )  finally:  loop.close() def push_to__dev(dataset_dict, hf_token=None):  """  Push the dataset to -dev/github-ises on Hugging Face Hub.  
Parameters:  - dataset_dict: The dataset to push  - hf_token: Hugging Face API token (if None, will try to read from environment)  Returns:  - URL to the dataset on Hugging Face Hub  """  # Get token from environment if not provided  if hf_token is None:  hf_token = os.environ.get("HF_TOKEN")  if not hf_token:  print("No Hugging Face token provided or found in environment.")  print("Saving dataset locally only.")  local_path = "./github-ises-dataset"  dataset_dict.save_to_disk(local_path)  return local_path  repo_id = "-dev/github-ises"  try:  print(f"Pushing dataset to Hugging Face Hub: {repo_id}")  dataset_dict.push_to_hub(repo_id=repo_id, token=hf_token)  hub_url = f"https://huggingface.co/datasets/{repo_id}"  print(f"Dataset ccessfully pushed to: {hub_url}")  return hub_url  except Exception as e:  print(f"Error pushing to Hugging Face Hub: {e}")  # Save locally as fallback  local_path = "./github-ises-dataset"  dataset_dict.save_to_disk(local_path)  print(f"Dataset saved locally to {local_path}")  return local_path # Add a function to check GitHub API status def check_github_api_status(headers):  """Check GitHub API status and rate mits"""  try:  # Check rate mit endpoint  response = requests.get(  "https://api.github.com/rate_mit", headers=headers, timeout=10  )  if response.status_code == 200:  mits = response.json()  core = mits.get("resources", {}).get("core", {})  search = mits.get("resources", {}).get("search", {})  # Calculate time until reset  core_reset = core.get("reset", 0)  search_reset = search.get("reset", 0)  now = time.time()  core_wait = max(0, core_reset - now)  search_wait = max(0, search_reset - now)  print("\n=== GitHub API Status ===")  print(  f"Core API: {core.get('remaining', 0)}/{core.get('mit', 0)} requests remaining"  )  print(f"Reset in: {core_wait/60:.1f} minutes")  print(  f"Search API: {search.get('remaining', 0)}/{search.get('mit', 0)} requests remaining"  )  print(f"Reset in: {search_wait/60:.1f} minutes")  # Return True if we 
have enough remaining calls  return core.get("remaining", 0) > 50 and search.get("remaining", 0) > 10  else:  print(f"Failed to check API status: {response.status_code}")  return False  except Exception as e:  print(f"Error checking GitHub API status: {e}")  return False def get_github_repo_dataset(  dataset: Dataset,  topics_column: str = "topics",  repos_per_topic=3,  max_ises_per_repo=50,  github_token=None,  max_workers=5,  min_stars=100,  output_file=None,  only_with_solutions=True, # This parameter is now used in the function call below ) -> Dataset:  # Set a default token for testing (you should replace this with your own token)  # This is just for convenience during development  default_token = None # Removed hardcoded token for security  # Check for GitHub token  github_token = get_github_token() or default_token  if not github_token:  print("\n" + "=" * 80)  print("WARNING: No GitHub token found. You will kely hit rate mits.")  print("Create a token at https://github.com/settings/tokens")  print("Then either:")  print(  "1. Set it as an environment variable: export GITHUB_TOKEN=your_token_here"  )  print("2. Save it to ~/.github_token file")  print("=" * 80 + "\n")  # Create token file option  try:  create_token = (  input("Would you ke to create a token file now? (y/n): ")  .strip()  .lower()  )  if create_token == "y":  token = input("Enter your GitHub token: ").strip()  if token:  token_file = os.path.expanduser("~/.github_token")  with open(token_file, "w") as f:  f.write(token)  print(f"Token saved to {token_file}")  github_token = token  else:  print("No token entered.")  if not github_token:  response = input("Continue without a token? (y/n): ").strip().lower()  if response != "y":  print("Exiting. 
Please set up a GitHub token and try again.")  exit(0)  except:  pass # If running non-interactively, continue anyway  else:  # Check API status with the token  headers = {  "Accept": "appcation/vnd.github.v3+json",  "Authorization": f"Bearer {github_token}",  }  if not check_github_api_status(headers):  print(  "\nWARNING: GitHub API rate mits may be too low to complete the task."  )  try:  response = input("Continue anyway? (y/n): ").strip().lower()  if response != "y":  print("Exiting. Please try again later when rate mits reset.")  exit(0)  except:  pass # If running non-interactively, continue anyway  # Create output directory if it doesn't exist  output_dir = "./github_ises_data"  os.makedirs(output_dir, exist_ok=True)  # Generate timestamp for filenames if no output file specified  if output_file is None:  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  output_file = f"{output_dir}/github_ises_{timestamp}.json"  # Set max_workers based on input parameter  max_workers = max_workers if github_token else 2  # Get topics from dataset  topics = st(set(dataset[topics_column]))  try:  # Collect GitHub ises by topic using passed parameters  github_dataset = collect_github_ises_by_topic(  topics=topics,  repos_per_topic=repos_per_topic,  max_ises_per_repo=max_ises_per_repo,  min_stars=min_stars,  github_token=github_token,  max_workers=max_workers,  output_file=output_file,  only_with_solutions=only_with_solutions, # Pass the parameter  )  return github_dataset  except KeyboardInterrupt:  print("\n\nScript interrupted by user. 
Exiting gracefully...")  # Try to load any intermediate relts  intermediate_file = f"{os.path.sptext(output_file)[0]}_intermediate.json"  if os.path.exists(intermediate_file):  print(f"Found intermediate relts at {intermediate_file}")  print("You can use these relts for partial data recovery.")  except Exception as e:  print(f"\nAn error occurred: {e}") import os import requests import json from typing import Dict, st, Any, Optional from pathb import Path import logging from datasets import Dataset # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # GitHub API constants GITHUB_API_URL = "https://api.github.com" GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") def get_github_repo_dataset(  repo_owner: str,  repo_name: str,  ise_number: Optional[int] = None,  get_solutions: bool = True,  close_ise: bool = False, ) -> Dataset:  """  Fetches GitHub repository data including ises and solutions with the most kes.  Args:  repo_owner: Owner of the GitHub repository  repo_name: Name of the GitHub repository  ise_number: Specific ise number to fetch (if None, fetches all open ises)  get_solutions: Whether to fetch solutions (comments) for the ises  close_ise: Whether to close the ise after fetching data  Returns:  Dataset containing the repository data  """  headers = {"Accept": "appcation/vnd.github.v3+json"}  if GITHUB_TOKEN:  headers["Authorization"] = f"token {GITHUB_TOKEN}"  else:  logger.warning(  "GITHUB_TOKEN not found in environment variables. API rate mits may apply."  
)  # Construct the repository URL  repo_url = f"{GITHUB_API_URL}/repos/{repo_owner}/{repo_name}"  # Get repository information  logger.info(f"Fetching repository information for {repo_owner}/{repo_name}")  repo_response = requests.get(repo_url, headers=headers)  if repo_response.status_code != 200:  logger.error(  f"Failed to fetch repository: {repo_response.status_code} - {repo_response.text}"  )  raise Exception(f"Failed to fetch repository: {repo_response.status_code}")  repo_data = repo_response.json()  # Get ises  ises_data = []  if ise_number:  # Get specific ise  ise_url = f"{repo_url}/ises/{ise_number}"  ise_response = requests.get(ise_url, headers=headers)  if ise_response.status_code != 200:  logger.error(  f"Failed to fetch ise #{ise_number}: {ise_response.status_code} - {ise_response.text}"  )  raise Exception(  f"Failed to fetch ise #{ise_number}: {ise_response.status_code}"  )  ises_data = [ise_response.json()]  else:  # Get all open ises  ises_url = f"{repo_url}/ises"  ises_response = requests.get(  ises_url, headers=headers, params={"state": "open"}  )  if ises_response.status_code != 200:  logger.error(  f"Failed to fetch ises: {ises_response.status_code} - {ises_response.text}"  )  raise Exception(f"Failed to fetch ises: {ises_response.status_code}")  ises_data = ises_response.json()  # Process ises and get solutions if requested  processed_ises = []  for ise in ises_data:  ise_info = {  "ise_number": ise["number"],  "title": ise["title"],  "body": ise["body"],  "created_at": ise["created_at"],  "user": ise["user"]["login"],  "state": ise["state"],  "url": ise["html_url"],  }  # Get solutions (comments) if requested  if get_solutions and "comments_url" in ise and ise["comments"] > 0:  comments_response = requests.get(ise["comments_url"], headers=headers)  if comments_response.status_code == 200:  comments = comments_response.json()  # Sort comments by reactions (kes)  comments.sort(  key=lambda x: x.get("reactions", {}).get("total_count", 0),  
reverse=True,  )  # Add the most ked solution  if comments:  ise_info["solution"] = comments[0]["body"]  ise_info["solution_user"] = comments[0]["user"]["login"]  ise_info["solution_kes"] = (  comments[0].get("reactions", {}).get("total_count", 0)  )  ise_info["solution_url"] = comments[0]["html_url"]  else:  logger.warning(  f"Failed to fetch comments for ise #{ise['number']}: {comments_response.status_code}"  )  processed_ises.append(ise_info)  # Close the ise if requested  if close_ise and ise_number:  close_url = f"{repo_url}/ises/{ise_number}"  close_data = {"state": "closed"}  close_response = requests.patch(close_url, headers=headers, json=close_data)  if close_response.status_code == 200:  logger.info(f"ccessfully closed ise #{ise_number}")  # Update the state in our processed data  for ise in processed_ises:  if ise["ise_number"] == ise_number:  ise["state"] = "closed"  else:  logger.error(  f"Failed to close ise #{ise_number}: {close_response.status_code} - {close_response.text}"  )  # Create dataset  dataset_data = {  "repository": {  "name": repo_data["name"],  "owner": repo_data["owner"]["login"],  "description": repo_data["description"],  "stars": repo_data["stargazers_count"],  "forks": repo_data["forks_count"],  "url": repo_data["html_url"],  },  "ises": processed_ises,  }  # Convert to Dataset format  rows = []  for ise in processed_ises:  row = {  "repo_name": f"{repo_owner}/{repo_name}",  "ise_number": ise["ise_number"],  "ise_title": ise["title"],  "ise_body": ise["body"],  "ise_url": ise["url"],  "ise_state": ise["state"],  }  if "solution" in ise:  row["solution"] = ise["solution"]  row["solution_user"] = ise["solution_user"]  row["solution_kes"] = ise["solution_kes"]  row["solution_url"] = ise["solution_url"]  rows.append(row)  # Create Dataset object  dataset = Dataset(rows=rows)  return dataset # Example usage: # dataset = get_github_repo_dataset("owner", "repo", ise_number=123, get_solutions=True, close_ise=True) 