Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions dev_rf/big_data/logs/logs.log

This file was deleted.

50 changes: 50 additions & 0 deletions dev_rf/big_data/with_chunks/chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import xml.etree.ElementTree as ET
from big_data_logging import configured_logger

logger = configured_logger()


def chunks(items, n=10):
    """Yield n striped chunks from a list.

    Chunk i contains every n-th element starting at offset i, so the
    chunks interleave ("striped") rather than being contiguous slices.

    Parameters
    ----------
    items : list
        List that must be chunked. (Renamed from ``list`` to avoid
        shadowing the builtin.)
    n : int
        Number of chunks (default 10).

    Yields
    ------
    list
        One striped chunk per iteration; empty input yields n empty lists.
    """
    for i in range(n):
        yield items[i::n]


def generate_chunks(path="./112010 Meta Stack Overflow/posts.xml", n=10):
    """Parse the posts XML file and split its rows into striped chunks.

    Parameters
    ----------
    path : str
        Path to the posts.xml file. Defaults to the project's data file
        so existing callers are unaffected.
    n : int
        Number of chunks to produce (default 10).

    Returns
    -------
    generator
        Generator yielding the different chunks of the XML file, where
        each chunk is a list of row-attribute dicts.
    """
    logger.info("Starting the chunks module...")

    # Load and parse the posts XML file, then collect the attribute dict
    # of each <row> element (child.attrib maps attribute name -> string).
    tree = ET.parse(path)
    root = tree.getroot()
    list_to_chunk = [child.attrib for child in root]

    return chunks(list_to_chunk, n)
File renamed without changes.
33 changes: 33 additions & 0 deletions dev_rf/big_data/with_chunks/logs/logs.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"12/10/2022" - INFO - root - Starting the main module...
"12/10/2022" - INFO - root - Starting the chunks module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Took 11.4 seconds to finish the tasks
"12/10/2022" - INFO - root - Starting the shuffler module...
"12/10/2022" - INFO - root - Starting the reducer module...
"12/10/2022" - INFO - root - Took 11.5 seconds to finish the tasks
"12/10/2022" - INFO - root - Starting the main module...
"12/10/2022" - INFO - root - Starting the chunks module...
"12/10/2022" - INFO - root - Took 0.9 seconds to finish the tasks
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Took 10.9 seconds to finish the tasks
"12/10/2022" - INFO - root - Starting the shuffler module...
"12/10/2022" - INFO - root - Starting the reducer module...
"12/10/2022" - INFO - root - Took 11.0 seconds to finish the tasks
15 changes: 15 additions & 0 deletions dev_rf/big_data/with_chunks/logs/logs.log.2022-10-12_19
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"12/10/2022" - INFO - root - Starting the main module...
"12/10/2022" - INFO - root - Starting the chunks module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the shuffler module...
"12/10/2022" - INFO - root - Starting the reducer module...
"12/10/2022" - INFO - root - Took 10.8 seconds to finish the tasks
64 changes: 64 additions & 0 deletions dev_rf/big_data/with_chunks/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import time

from chunks import generate_chunks
from mapper import mapper
from shuffler import shuffler
from reducer import reducer
from big_data_logging import configured_logger
from save_into_file import save_into_file

logger = configured_logger()


def main():
    """Run the chunked map/shuffle/reduce pipeline and save results to CSV.

    Orchestrates: chunk generation -> per-chunk mapping -> flattening ->
    shuffling -> reducing -> writing the three result CSV files. Timing
    is reported through the module logger after each major phase.
    """
    logger.info("Starting the main module...")

    start_time = time.time()

    # Create the generator of XML-row chunks.
    chunks = generate_chunks()

    duration = time.time() - start_time
    logger.info(f"Took {round(duration,1)} seconds to finish the tasks")

    # Per-chunk mapper outputs, one sub-list per chunk.
    post_views_list = []
    mapped_tags_list = []
    score_answertime_list = []

    for chunk in chunks:
        # Map the three tasks over this chunk.
        post_views, mapped_tags, score_answertime = mapper(chunk)

        post_views_list.append(post_views)
        mapped_tags_list.append(mapped_tags)
        score_answertime_list.append(score_answertime)

    duration = time.time() - start_time
    logger.info(f"Took {round(duration,1)} seconds to finish the tasks")

    # Flatten the lists of per-chunk lists into single flat lists.
    # (Comprehensions are O(n); the previous sum(lists, []) re-copied the
    # accumulator on every step, i.e. accidental O(n^2).)
    post_views_list = [item for sub in post_views_list for item in sub]
    mapped_tags_list = [item for sub in mapped_tags_list for item in sub]
    score_answertime_list = [item for sub in score_answertime_list for item in sub]

    # Use the shuffler function to shuffle the mapped tasks.
    post_views_list, mapped_tags_list, score_answertime_list = shuffler(
        post_views_list, mapped_tags_list, score_answertime_list
    )

    # Use the reducer function to reduce the shuffled tasks.
    top10_post_views, tags_reduced, average_answer_time = reducer(
        post_views_list, mapped_tags_list, score_answertime_list
    )

    save_into_file(top10_post_views, tags_reduced, average_answer_time)

    duration = time.time() - start_time
    logger.info(f"Took {round(duration,1)} seconds to finish the tasks")


if __name__ == "__main__":
    main()
107 changes: 107 additions & 0 deletions dev_rf/big_data/with_chunks/mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import xml.etree.ElementTree as ET
import datetime
from big_data_logging import configured_logger

logger = configured_logger()


def get_answer_dict(path="./112010 Meta Stack Overflow/posts.xml"):
    """Map each answer's Id to its CreationDate from the posts XML file.

    Parameters
    ----------
    path : str
        Path to the posts.xml file. Defaults to the project's data file
        so existing callers are unaffected. (The original docstring
        documented a ``root`` parameter that did not exist.)

    Returns
    -------
    answer_dict : dict
        Maps answer Id (str) -> CreationDate (ISO-format str).
    """
    # Load and parse the posts.xml file.
    tree = ET.parse(path)
    # Get the root of the xml.
    root = tree.getroot()

    answer_dict = {}

    # Loop over each row to collect the answers' creation dates.
    for child in root:
        attrs = child.attrib

        # PostTypeId == "2" marks an answer; "1" marks a question.
        if attrs["PostTypeId"] == "2":
            answer_dict[attrs["Id"]] = attrs["CreationDate"]

    return answer_dict


def mapper(chunk):
    """Map one chunk of post-attribute dicts into the 3 required tasks:
    - 1 - Top 10 posts views
    - 2 - Top 10 words in tags
    - 3 - Score and answer time

    Parameters
    ----------
    chunk : list
        List of XML row-attribute dicts, one per post.

    Returns
    -------
    post_views : list
        List of dicts. Each dict has key 'Id' and 'ViewCount'.
    mapped_tags : list
        List of dicts. Each dict has as key a tag and as value 1.
    score_answertime : list
        List of dicts. Each dict has key 'Score' and 'ResponseTime' in hours.
    """
    logger.info("Starting the mapper module...")

    # Initialize result accumulators.
    post_views = []
    mapped_tags = []
    score_answertime = []

    # Get the answer dict. key=answer_id, value=CreationDate.
    answer_dict = get_answer_dict()

    # Loop over each row-attribute dict in the chunk.
    for attrs in chunk:

        # 1 - Top 10 posts views
        # Append the post_id and the view_count of each post.
        post_views.append({"Id": attrs["Id"], "ViewCount": int(attrs["ViewCount"])})

        # 2 - Top 10 words in tags
        # Tags look like "<a><b>"; strip the angle brackets and split.
        # NOTE: the original bare `except: continue` on a missing "Tags"
        # attribute also skipped task 3 for that post; an untagged post
        # now still gets its score/answer time processed.
        tags = attrs.get("Tags")
        if tags:
            for tag in tags.replace("<", " ").replace(">", " ").split():
                mapped_tags.append({tag: 1})

        # 3 - Score and answer time (questions only: PostTypeId == "1").
        if attrs["PostTypeId"] == "1":
            # Get question score and creation time.
            post_score = int(attrs["Score"])
            post_creation_time = datetime.datetime.fromisoformat(attrs["CreationDate"])
            try:
                # Questions without an accepted answer have no
                # AcceptedAnswerId and are skipped via KeyError below.
                accepted_answer_id = attrs["AcceptedAnswerId"]

                # Look up the accepted answer's creation date and parse it.
                accepted_answer_time = datetime.datetime.fromisoformat(
                    answer_dict[accepted_answer_id]
                )

                # Response time from question creation to accepted answer
                # creation, in hours.
                # BUGFIX: timedelta.seconds discards whole days, so answers
                # arriving more than a day later were undercounted; use
                # total_seconds() instead.
                response_time = round(
                    (accepted_answer_time - post_creation_time).total_seconds() / 3600,
                    2,
                )

                score_answertime.append(
                    {"Score": post_score, "ResponseTime": response_time}
                )
            except KeyError:
                # Missing AcceptedAnswerId, or an answer id absent from
                # answer_dict: skip this question.
                continue

    return post_views, mapped_tags, score_answertime
File renamed without changes.
2 changes: 2 additions & 0 deletions dev_rf/big_data/with_chunks/results/AverageAnswerTime.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Average time to get an accepted answer (hours)
5.9158
25 changes: 25 additions & 0 deletions dev_rf/big_data/with_chunks/save_into_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import pandas as pd


def save_into_file(top10_post_views, tags_reduced, average_answer_time):
    """Persist the three reduced results as CSV files under ./results.

    Parameters
    ----------
    top10_post_views : list
        Top 10 most viewed posts (dicts with 'Id' and 'ViewCount').
    tags_reduced : list
        (Tag, Count) pairs for the most used tags.
    average_answer_time : float
        Average time (hours) to get an accepted answer.
    """
    # Create the results folder if it doesn't exist. exist_ok avoids the
    # check-then-create race of the previous exists()/makedirs() pair.
    try:
        os.makedirs(os.path.join(os.getcwd(), "results"), exist_ok=True)
    except OSError:
        print("Folder cannot be created")

    # Save to a csv the Top 10 most viewed posts.
    top10_post_views_df = pd.DataFrame(top10_post_views)
    top10_post_views_df.to_csv("./results/Top10MostViewedPosts.csv", index=False)

    # Save to a csv the Top 10 tags.
    tags_reduced_df = pd.DataFrame(tags_reduced, columns=["Tag", "Count"])
    tags_reduced_df.to_csv("./results/MostUsedTags.csv", index=False)

    # Save to a csv the Average time to get an accepted answer.
    average_answer_time_serie = pd.Series(average_answer_time, name="Average time to get an accepted answer (hours)")
    average_answer_time_serie.to_csv("./results/AverageAnswerTime.csv", index=False)
File renamed without changes.
29 changes: 29 additions & 0 deletions dev_rf/big_data/without_chunks/big_data_logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import logging
from logging import config
import os


def configured_logger():
    """Set up and return the root logger using the logger.cfg file.

    The logger has 2 functions:
    1 - Display the logging messages in the console
    2 - Save the messages to a log file every week (Every Sunday)

    Returns
    -------
    logging.Logger
        The configured "root" logger.
    """
    # Get current working directory.
    cwd = os.getcwd()

    # Create the logs folder (target of the file handler) if it does not
    # exist. exist_ok avoids the check-then-create race.
    try:
        os.makedirs(os.path.join(cwd, "logs"), exist_ok=True)
    except OSError:
        print("Folder cannot be created")

    # Load the handler/formatter configuration (expects logger.cfg in cwd).
    config.fileConfig("logger.cfg")

    # Create logger with the configuration.
    logger = logging.getLogger("root")

    return logger
35 changes: 35 additions & 0 deletions dev_rf/big_data/without_chunks/logger.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[loggers]
keys=root

[handlers]
keys=consoleHandler, fileHandler

[formatters]
keys=myFormatter

[logger_root]
level=INFO
handlers=consoleHandler, fileHandler

# Handler to display in console
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=myFormatter
args=(sys.stdout,)

# Handler to save the log in files
[handler_fileHandler]
class=handlers.TimedRotatingFileHandler
level=INFO
formatter=myFormatter
# Rotate every Sunday ('W6') and keep 1 backup file.
# NOTE: logging.config.fileConfig only reads the args/kwargs keys when
# constructing a handler; bare when=/interval=/backupCount= keys are
# ignored, so the rotation options must be passed via kwargs.
args=("logs/logs.log",)
kwargs={'when': 'W6', 'interval': 1, 'backupCount': 1}

# Formatter
[formatter_myFormatter]
format=%(asctime)s - %(levelname)s - %(name)s - %(message)s
datefmt="%d/%m/%Y"
5 changes: 5 additions & 0 deletions dev_rf/big_data/without_chunks/logs/logs.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"12/10/2022" - INFO - root - Starting the main module...
"12/10/2022" - INFO - root - Starting the mapper module...
"12/10/2022" - INFO - root - Starting the shuffler module...
"12/10/2022" - INFO - root - Starting the reducer module...
"12/10/2022" - INFO - root - Took 1.5 seconds to finish the tasks
File renamed without changes.
Loading