-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.py
More file actions
71 lines (60 loc) · 4.62 KB
/
crawl.py
File metadata and controls
71 lines (60 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import subprocess

from django.core.management.base import BaseCommand
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from django_cog import cog

# Import our spiders from our scrapy project.
from exploit_scrapers.exploit_scrapers.spiders import (
    cxsecurity_spider,
    exploit_db_csv_spider,
    nvd_spider,
    packetstorm_spider,
    readme_spider,
)
from open_exploit_database_scraping.utils.metasploit_to_exploit import metasploit_json_to_exploit
class Command(BaseCommand):
    """Django management command that scrapes exploit data from several sources.

    Runs the CXSecurity, ExploitDB, PacketStorm, ReadMe/GitHub-repo, Metasploit
    and NVD scrapers in one Scrapy ``CrawlerProcess``. Each scraper can be
    individually disabled with its "no-run" flag; an optional positional
    ``n_days`` limits how far back from the current date the scrapers look.
    """

    help = "Will scrape the various website for exploit info."

    def add_arguments(self, parser):
        """Register the command-line arguments for this command.

        Note the inverted semantics of the per-scraper flags: passing a flag
        DISABLES that scraper (they are "no-run" flags).
        """
        # Add the optional argument flag for updating.
        parser.add_argument("n_days", nargs='?', type=str, help="The max number of days to look back from the current date.", default=None)
        # Add an all flag for saving all exploits, not just the ones with CVE IDs
        parser.add_argument("-a", "--all", action="store_true", help="A flag indicating to save all scraped exploits, even if they don't have a CVE ID.")
        parser.add_argument("-s", "--start_page", type=str, help="The page to start (packetstorm & cxsecurity)", default=None)
        parser.add_argument("-l", "--end_page", type=str, help="The page to end before (packetstorm & cxsecurity)", default=None)
        # Add no-run flags for each scraper so that each scraper can be selectively turned off.
        parser.add_argument("-e", "--exploitdb", action="store_true", help="A no-run flag for the ExploitDB scraper, if passed the scraper will not run.")
        parser.add_argument("-c", "--cxsecurity", action="store_true", help="A no-run flag for the CXSecurity scraper, if passed the scraper will not run.")
        parser.add_argument("-p", "--packetstorm", action="store_true", help="A no-run flag for the PacketStorm scraper, if passed the scraper will not run.")
        parser.add_argument("-r", "--repo", action="store_true", help="A no-run flag for the ReadMe/Github Repo scraper, if passed the scraper will not run.")
        parser.add_argument("-m", "--metasploit", action="store_true", help="A no-run flag for the Metasploit scraper, if passed the scraper will not run.")
        parser.add_argument("-n", "--nvd", action="store_true", help="A no-run flag for the NVD JSON Feeds scraper, if passed the scraper will not run.")

    def handle(self, *args, **kwargs):
        """Run every scraper that has not been flagged off, then block until done."""
        no_run_flags = ("cxsecurity", "exploitdb", "packetstorm", "repo", "metasploit", "nvd")
        # If every scraper has been marked to not run, there is nothing to do.
        if all(kwargs[flag] for flag in no_run_flags):
            self.stdout.write("\n[INFO] All crawlers flagged for no-run, exiting script.\n")
            return

        # Point Scrapy at our project's settings module before building the process.
        settings_file_path = 'exploit_scrapers.exploit_scrapers.settings'
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
        # Init a crawler process for spiders.
        process = CrawlerProcess(settings=get_project_settings())

        # Queue each Scrapy spider unless its no-run flag was passed.
        if not kwargs["cxsecurity"]:
            process.crawl(cxsecurity_spider.CXSecuritySpider, kwargs["all"], kwargs["n_days"], kwargs["start_page"], kwargs["end_page"])
        if not kwargs["exploitdb"]:
            process.crawl(exploit_db_csv_spider.CSVSpider, kwargs["all"], kwargs["n_days"])
        if not kwargs["packetstorm"]:
            process.crawl(packetstorm_spider.PacketStormSpider, kwargs["all"], kwargs["n_days"], kwargs["start_page"], kwargs["end_page"])
        if not kwargs["repo"]:
            process.crawl(readme_spider.ReadMeSpider, kwargs["all"], kwargs["n_days"])
        if not kwargs["metasploit"]:
            # The metasploit "scraper" clones (or pulls, if already cloned) the
            # metasploit repo and extracts the data from it.
            cmd = ["python", "git_vul_driller/parse_metasploit_metadata.py"]
            if kwargs["n_days"]:
                cmd += ["-r", kwargs["n_days"]]
            # List-form subprocess.run (shell=False) instead of os.system so the
            # CLI-supplied n_days value is never interpolated into a shell string.
            # Exit status is deliberately not checked, matching the previous
            # best-effort os.system behavior.
            subprocess.run(cmd)
            # Once the scraper is finished, use the results file to populate the django Exploit db.
            if kwargs["n_days"]:
                metasploit_json_to_exploit(kwargs["all"], kwargs["n_days"])
            else:
                metasploit_json_to_exploit(kwargs["all"])
        if not kwargs["nvd"]:
            process.crawl(nvd_spider.NVDSpider, kwargs["all"], kwargs["n_days"])

        # Start the crawlers; start() blocks until every queued crawl finishes.
        self.stdout.write("\n[INFO] Crawler(s) starting...\n")
        process.start()
        # Once they are all crawled, stop the process.
        process.stop()
        self.stdout.write("\n[INFO] Crawling done.\n")