-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.py
More file actions
71 lines (60 loc) · 4.62 KB
/
crawl.py
File metadata and controls
71 lines (60 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import subprocess

from django.core.management.base import BaseCommand
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from django_cog import cog

# Import our spiders from our scrapy project.
from exploit_scrapers.exploit_scrapers.spiders import (
    cxsecurity_spider,
    exploit_db_csv_spider,
    nvd_spider,
    packetstorm_spider,
    readme_spider,
)
from open_exploit_database_scraping.utils.metasploit_to_exploit import metasploit_json_to_exploit
class Command(BaseCommand):
    """Django management command that scrapes exploit data from several sources.

    Runs the CXSecurity, ExploitDB, PacketStorm, ReadMe/GitHub-repo, Metasploit
    and NVD scrapers in one Scrapy ``CrawlerProcess``. Each scraper can be
    individually disabled with its "no-run" flag; an optional positional
    ``n_days`` limits how far back from the current date the scrapers look.
    """

    help = "Will scrape the various website for exploit info."

    def add_arguments(self, parser):
        """Register the command-line arguments for this command.

        Note the inverted semantics of the per-scraper flags: passing a flag
        DISABLES that scraper (they are "no-run" flags).
        """
        # Add the optional argument flag for updating.
        parser.add_argument("n_days", nargs='?', type=str, help="The max number of days to look back from the current date.", default=None)
        # Add an all flag for saving all exploits, not just the ones with CVE IDs
        parser.add_argument("-a", "--all", action="store_true", help="A flag indicating to save all scraped exploits, even if they don't have a CVE ID.")
        parser.add_argument("-s", "--start_page", type=str, help="The page to start (packetstorm & cxsecurity)", default=None)
        parser.add_argument("-l", "--end_page", type=str, help="The page to end before (packetstorm & cxsecurity)", default=None)
        # Add no-run flags for each scraper so that each scraper can be selectively turned off.
        parser.add_argument("-e", "--exploitdb", action="store_true", help="A no-run flag for the ExploitDB scraper, if passed the scraper will not run.")
        parser.add_argument("-c", "--cxsecurity", action="store_true", help="A no-run flag for the CXSecurity scraper, if passed the scraper will not run.")
        parser.add_argument("-p", "--packetstorm", action="store_true", help="A no-run flag for the PacketStorm scraper, if passed the scraper will not run.")
        parser.add_argument("-r", "--repo", action="store_true", help="A no-run flag for the ReadMe/Github Repo scraper, if passed the scraper will not run.")
        parser.add_argument("-m", "--metasploit", action="store_true", help="A no-run flag for the Metasploit scraper, if passed the scraper will not run.")
        parser.add_argument("-n", "--nvd", action="store_true", help="A no-run flag for the NVD JSON Feeds scraper, if passed the scraper will not run.")

    def handle(self, *args, **kwargs):
        """Run every scraper that has not been flagged off, then block until done."""
        no_run_flags = ("cxsecurity", "exploitdb", "packetstorm", "repo", "metasploit", "nvd")
        # If every scraper has been marked to not run, there is nothing to do.
        if all(kwargs[flag] for flag in no_run_flags):
            self.stdout.write("\n[INFO] All crawlers flagged for no-run, exiting script.\n")
            return

        # Point Scrapy at our project's settings module before building the process.
        settings_file_path = 'exploit_scrapers.exploit_scrapers.settings'
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
        # Init a crawler process for spiders.
        process = CrawlerProcess(settings=get_project_settings())

        # Queue each Scrapy spider unless its no-run flag was passed.
        if not kwargs["cxsecurity"]:
            process.crawl(cxsecurity_spider.CXSecuritySpider, kwargs["all"], kwargs["n_days"], kwargs["start_page"], kwargs["end_page"])
        if not kwargs["exploitdb"]:
            process.crawl(exploit_db_csv_spider.CSVSpider, kwargs["all"], kwargs["n_days"])
        if not kwargs["packetstorm"]:
            process.crawl(packetstorm_spider.PacketStormSpider, kwargs["all"], kwargs["n_days"], kwargs["start_page"], kwargs["end_page"])
        if not kwargs["repo"]:
            process.crawl(readme_spider.ReadMeSpider, kwargs["all"], kwargs["n_days"])
        if not kwargs["metasploit"]:
            # The metasploit "scraper" clones (or pulls, if already cloned) the
            # metasploit repo and extracts the data from it.
            cmd = ["python", "git_vul_driller/parse_metasploit_metadata.py"]
            if kwargs["n_days"]:
                cmd += ["-r", kwargs["n_days"]]
            # List-form subprocess.run (shell=False) instead of os.system so the
            # CLI-supplied n_days value is never interpolated into a shell string.
            # Exit status is deliberately not checked, matching the previous
            # best-effort os.system behavior.
            subprocess.run(cmd)
            # Once the scraper is finished, use the results file to populate the django Exploit db.
            if kwargs["n_days"]:
                metasploit_json_to_exploit(kwargs["all"], kwargs["n_days"])
            else:
                metasploit_json_to_exploit(kwargs["all"])
        if not kwargs["nvd"]:
            process.crawl(nvd_spider.NVDSpider, kwargs["all"], kwargs["n_days"])

        # Start the crawlers; start() blocks until every queued crawl finishes.
        self.stdout.write("\n[INFO] Crawler(s) starting...\n")
        process.start()
        # Once they are all crawled, stop the process.
        process.stop()
        self.stdout.write("\n[INFO] Crawling done.\n")