From c60a6d422901176813bea4123ce1a8ecec541cb2 Mon Sep 17 00:00:00 2001 From: Ezri Brimhall Date: Tue, 8 Oct 2024 15:03:52 -0600 Subject: [PATCH] Added CSV parsing with Python --- README.md | 2 ++ parse-csv/README.md | 7 ++++ parse-csv/parse-csv.py | 74 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 parse-csv/README.md create mode 100755 parse-csv/parse-csv.py diff --git a/README.md b/README.md index 00c28f9..b090f65 100644 --- a/README.md +++ b/README.md @@ -28,5 +28,7 @@ As I am most familiar with Linux and Python, I elected to do the following exerc - [Exercise 1: User Report](user-report) - [Exercise 2: Watchdog Script](size-watchdog) +- [Exercise 3: Remove an Invalid Character](invalid-char) - [Exercise 4: Condition Testing](condition-testing) - [Exercise 5: Log Parsing](python-logs) +- [Exercise 6: CSV Parsing](parse-csv) diff --git a/parse-csv/README.md b/parse-csv/README.md new file mode 100644 index 0000000..1ed672a --- /dev/null +++ b/parse-csv/README.md @@ -0,0 +1,7 @@ +## Parse a CSV file with Python + +For this task, I leveraged my experience in web software development, particularly with Python, to quickly create a script capable of downloading the requested CSV file and parsing it into a list of dictionaries. This functionality exists in the `download_csv` function, and can be easily run from a Python interactive session with an arbitrary URL. Simply import the script file and call this function. Note that it assumes that the first line of the downloaded CSV file contains column headers. This is how it generates dictionaries from each line, and the given CSV file is in this format. + +From here, the data can be easily manipulated and searched. When the script is run from the command line, it will produce the requested report, counting active and inactive users, and validating the email, role, status, and created date for each user. 
Alternatively, the `main` function can be called from an interactive session after importing the file.
+
+For an exceptionally large CSV, this might become slow to search through, as it is just a native Python list. For crazy-large datasets (such as CSV-format log files spanning months or years), or when relating between multiple CSV files (like in a database), it would probably make more sense to use something like Pandas rather than the `csv` builtin library, as it makes use of the more efficient `numpy` arrays and can do indexing.
diff --git a/parse-csv/parse-csv.py b/parse-csv/parse-csv.py
new file mode 100755
index 0000000..30d35f0
--- /dev/null
+++ b/parse-csv/parse-csv.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import sys
+import csv
+import os
+import requests
+import re
+
+URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vR_SfJ7xUN3pcnRw3Z5b0DeINfFmZvWvquoJuBNS0UiA4h2xeftVlOml2BI162JiarV0XyvIOxaEJFQ/pub?gid=0&single=true&output=csv"
+
+
+def download(url):
+    """
+    Downloads the resource at the given url and returns its body as UTF-8 text.
+
+    Raises requests.HTTPError on a non-2xx response, so an HTML error
+    page is never silently parsed as CSV data.
+    """
+    response = requests.get(url)
+    # Fail loudly on HTTP errors instead of returning an error page body.
+    response.raise_for_status()
+    content = response.content.decode("utf-8")
+    return content
+
+
+def download_csv(url):
+    """
+    Downloads the CSV file at the given url and parses it.
+
+    Treats the first row as a header, and returns a list of dictionaries
+    built from subsequent rows, with the header row's fields as keys.
+    An empty document yields an empty header and an empty list.
+    """
+    response = download(url)
+    reader = csv.reader(response.splitlines())
+    data = list(reader)
+    # Guard against an empty download: data[0] would raise IndexError.
+    if not data:
+        return [], []
+    header = data[0]
+    data = [dict(zip(header, row)) for row in data[1:]]
+
+    return header, data
+
+
+def main():
+    """
+    Prints the user report: active/inactive counts plus any invalid entries
+    (bad email, role, status, or creation date) with the reasons why.
+    """
+    header, data = download_csv(URL)
+
+    active = 0
+    inactive = 0
+    invalid = []
+    for row in data:
+        reason = []
+        # Minimal sanity check: expects a user@domain.tld shape.
+        if "@" not in row["email"] or "." not in row["email"].split("@")[1]:
+            reason.append(f"Invalid email address '{row['email']}'")
+
+        if row["role"] not in ["admin", "user"]:
+            reason.append(f"Invalid role '{row['role']}'")
+
+        if row["status"] not in ["active", "inactive"]:
+            reason.append(f"Invalid status '{row['status']}'")
+        elif row["status"] == "active":
+            active += 1
+        else:
+            inactive += 1
+
+        # Creation dates must begin with an ISO-style YYYY-MM-DD date.
+        if not re.search(r"^\d\d\d\d(-\d\d){2}", row["created_on"]):
+            reason.append(f"Invalid creation date '{row['created_on']}'")
+
+        if reason:
+            invalid.append({"entry": row, "reason": reason})
+
+    print(f"Active users: {active}")
+    print(f"Inactive users: {inactive}")
+
+    if invalid:
+        print(f"{len(invalid)} invalid entries found")
+        for row in invalid:
+            print(f"\t User: {row['entry']['username']}")
+            print("\t Reasons:")
+            for reason in row["reason"]:
+                print(f"\t\t{reason}")
+
+
+if __name__ == "__main__":
+    main()