fetch.py
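"""Fetch Python and JavaScript source files from GitHub's code search API.

For each keyword below, the script searches GitHub code, downloads the
matching files, de-duplicates them by MD5 content hash, and appends one
metadata row per saved file to datasets/metadata.csv so that an interrupted
run can be resumed.
"""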
import base64
import csv
import hashlib
import os
import random
import time

import requests
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
OUTPUT_DIR = "datasets"
META_FILE = os.path.join(OUTPUT_DIR, "metadata.csv")
NUM_FILES_PER_LANG = 2000
FILES_PER_KEYWORD = 10
os.makedirs(OUTPUT_DIR, exist_ok=True)
headers = {"Authorization": f"token {GITHUB_TOKEN}"}
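# Note: GitHub's code-search endpoint is rate-limited far more aggressively
# than the rest of the REST API, so an authenticated token is effectively
# required for a run of this size.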
# Resume support: reload content hashes and per-language counts from the
# metadata of any previous run so already-downloaded files are skipped.
seen_hashes = set()
resume_counts = {"Python": 0, "JavaScript": 0}
if os.path.exists(META_FILE):
    with open(META_FILE, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            seen_hashes.add(row["content_hash"])
            if row.get("language") in resume_counts:
                resume_counts[row["language"]] += 1
# Keywords for searching repos
KEYWORDS = [
    "codeforces", "leetcode", "hackerrank", "codechef", "codewars",
    "data-structures", "algorithms", "interview-prep",
    "django", "flask", "fastapi", "rest-api", "web-scraper",
    "tensorflow", "pytorch", "scikit-learn", "data-analysis", "ml-projects",
    "automation", "scripts", "cli-tool", "python-utils",
    "python-tutorial", "python-examples", "beginner-python", "learn-python",
    "javascript project", "js project", "nodejs", "react", "vue", "angular",
    "express", "nextjs", "nuxtjs", "svelte",
    "frontend", "backend", "fullstack", "web-app",
    "beginner-javascript", "javascript-examples", "learn-javascript",
    "university", "practice", "tutorial", "ai",
]
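# Each keyword is paired with a language qualifier to build the search
# query, e.g. "leetcode language:Python" (see search_files below).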
def search_files(query, per_page=50, page=1):
    """Return one page of code-search items, or [] on any non-200 response."""
    url = "https://api.github.com/search/code"
    params = {"q": query, "per_page": per_page, "page": page}
    r = requests.get(url, headers=headers, params=params)
    if r.status_code == 200:
        return r.json().get("items", [])
    # Rate limits (403/429) and other errors are silently treated as "no results".
    return []
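# A minimal sketch of an alternative with retry/backoff, in case you would
# rather wait out GitHub's throttling than drop the page. The helper name
# `search_files_with_retry` and its defaults are ours, not part of the
# original script.
def search_files_with_retry(query, per_page=50, page=1, retries=3):
    url = "https://api.github.com/search/code"
    params = {"q": query, "per_page": per_page, "page": page}
    for attempt in range(retries):
        r = requests.get(url, headers=headers, params=params)
        if r.status_code == 200:
            return r.json().get("items", [])
        if r.status_code in (403, 429):
            # Honor Retry-After when GitHub sends it; otherwise back off exponentially.
            time.sleep(int(r.headers.get("Retry-After", 2 ** attempt)))
            continue
        break  # Other errors are not worth retrying.
    return []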
def download_file(repo_full_name, path):
    """Fetch a file's text via the contents API; return None on failure."""
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return None
    data = r.json()
    if data.get("encoding") == "base64":
        return base64.b64decode(data.get("content", "")).decode("utf-8", errors="ignore")
    return None
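# The contents API only inlines base64-encoded bodies for regular files up
# to roughly 1 MB; larger files report a different encoding and fall through
# to None above.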
def collect_files():
    start_time = time.time()
    # Start from the counts recovered from metadata.csv (zero on a fresh run).
    collected = dict(resume_counts)
    with open(META_FILE, "a", newline="", encoding="utf-8") as meta:
        writer = csv.writer(meta)
        if os.stat(META_FILE).st_size == 0:
            writer.writerow(["local_file", "repo_url", "original_path", "content_hash",
                             "download_time", "keyword", "language"])
        for lang in ["Python", "JavaScript"]:
            if collected[lang] >= NUM_FILES_PER_LANG:
                continue
            while collected[lang] < NUM_FILES_PER_LANG:
                # Reshuffle and sweep all keywords again until the target is
                # reached (this can loop indefinitely if searches dry up).
                random.shuffle(KEYWORDS)
                for kw in KEYWORDS:
                    page = 1
                    files_downloaded = 0
                    while files_downloaded < FILES_PER_KEYWORD and collected[lang] < NUM_FILES_PER_LANG:
                        files = search_files(f"{kw} language:{lang}", per_page=50, page=page)
                        if not files:
                            break
                        for item in files:
                            repo_name = item["repository"]["full_name"]
                            repo_url = item["repository"]["html_url"]
                            path = item["path"]
                            try:
                                content = download_file(repo_name, path)
                                # Skip failed downloads and trivially short files.
                                if not content or len(content.splitlines()) < 10:
                                    continue
                                content_hash = hashlib.md5(content.encode()).hexdigest()
                                if content_hash in seen_hashes:
                                    continue
                                seen_hashes.add(content_hash)
                                repo_safe = repo_name.replace("/", "_")
                                local_name = f"{repo_safe}_{path.split('/')[-1]}"
                                save_path = os.path.join(OUTPUT_DIR, local_name)
                                with open(save_path, "w", encoding="utf-8") as f:
                                    f.write(content)
                                writer.writerow([local_name, repo_url, path, content_hash,
                                                 round(time.time() - start_time, 2), kw, lang])
                                collected[lang] += 1
                                files_downloaded += 1
                                print(f"[+] Saved {local_name} ({collected[lang]}/{NUM_FILES_PER_LANG}) [{kw}] [{lang}]")
                                if collected[lang] >= NUM_FILES_PER_LANG or files_downloaded >= FILES_PER_KEYWORD:
                                    break
                            except Exception:
                                continue
                        page += 1
                        # Crude throttle between search pages.
                        time.sleep(0.1)
                    if collected[lang] >= NUM_FILES_PER_LANG:
                        break

    total_time = round(time.time() - start_time, 2)
    print(f"\n✅ Done! Collected Python: {collected['Python']} | JavaScript: {collected['JavaScript']} in {total_time}s")


if __name__ == "__main__":
    collect_files()
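# Example invocation (the token value is a placeholder):
#   GITHUB_TOKEN=ghp_xxxx python fetch.py
# Re-running the same command resumes: hashes already recorded in
# datasets/metadata.csv are skipped and per-language counts pick up where
# they left off.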