I've been working on an individual final project for my Python CS class: a program that checks my teacher's website daily and determines whether he has changed any of the pages on his site since the last time the program ran.
I would really love some suggestions for improving my code, now that it works! I've added some functionality so that it runs via a cron job on a cloud server and sends out an email whenever a page changes.
"""Monitor the CSC110 course website and email a report when pages change.

Designed to run once a day (e.g. from a cron job on a cloud server):
each run rotates the previous snapshot files aside, downloads fresh
copies of the site's link list and page contents, compares new against
old, and emails a summary of any changed pages.
"""

import difflib   # show line-level differences between link snapshots
import filecmp   # cheap whole-file equality checks between snapshots
import os        # rename snapshot files to their previous_ names
import smtplib   # send the notification email through Gmail
import sys       # NOTE(review): unused in this script — safe to delete
from email.mime.multipart import MIMEMultipart  # email envelope (From/To/Subject)
from email.mime.text import MIMEText            # plain-text email body

import requests                  # download the HTML
from bs4 import BeautifulSoup    # parse the HTML

root_url = "https://sites.google.com"
index_url = root_url + "/site/csc110winter2015/home"


def _page_filename(link):
    """Map a site link to a flat snapshot filename (slashes become dots).

    The link is stripped first: the original code kept the trailing
    newline from file iteration, producing filenames with an embedded
    '\\n' and breaking the sitemap-exclusion comparison in check_pages().
    """
    return link.strip().replace("/", ".") + ".txt"


def get_site_links():
    """Return the hrefs of the index page's top-level course-site links."""
    response = requests.get(index_url)
    # Name the parser explicitly — bare BeautifulSoup(text) emits a
    # warning and can pick different parsers on different machines.
    soup = BeautifulSoup(response.text, "html.parser")
    # The attribute value must be quoted: unquoted values containing "/"
    # are rejected by newer CSS-selector engines (soupsieve).
    anchors = soup.select('li.topLevel a[href^="/site/csc110winter2015/"]')
    return [a.attrs.get('href') for a in anchors]


def try_read_links_file():
    """Rotate links.txt to previous_links.txt, then fetch a fresh links.txt.

    On the very first run (no links.txt yet) a fresh copy is written and
    rotated before refetching, so both files exist and are identical —
    i.e. the run reports "no changes", matching the original behavior.
    """
    try:
        os.rename("links.txt", "previous_links.txt")
    except OSError:  # IOError is an alias of OSError in Python 3
        print("No links.txt file exists; creating one now.")
        write_links_file()
        os.rename("links.txt", "previous_links.txt")
    write_links_file()


def write_links_file():
    """Save the site's current links to links.txt, one link per line."""
    with open("links.txt", mode='wt', encoding='utf-8') as out_file:
        out_file.write('\n'.join(get_site_links()))


def check_links():
    """Print any links added to or removed from the site since the last run."""
    if filecmp.cmp("links.txt", "previous_links.txt"):
        return  # link list unchanged — nothing to report
    # with-blocks close the files; the original leaked both handles.
    with open("previous_links.txt", encoding='utf-8') as f:
        previous_links = f.readlines()
    with open("links.txt", encoding='utf-8') as f:
        links = f.readlines()
    for difference in difflib.Differ().compare(previous_links, links):
        # Differ puts "- " / "+ " at the START of changed lines; test the
        # prefix — the original's `'- ' in difference` could match a
        # hyphen-space anywhere inside an unchanged line's text.
        if difference.startswith('- '):
            print(difference.strip() + "\nWas a removed page from the CSC110 website since the last time checked.\n")
        elif difference.startswith('+ '):
            print(difference.strip() + "\nWas an added page to the CSC110 website since the last time checked.\n")


def try_read_pages_files():
    """Rotate each page snapshot to its previous_ name, then refetch them all."""
    with open("links.txt", encoding='utf-8') as pages:
        for page in pages:
            name = _page_filename(page)
            try:
                os.rename(name, "previous_" + name)
            except OSError:
                # First run: snapshots don't exist yet. Create them, then
                # rotate this one and continue (the original recursed over
                # the whole list, re-renaming files it had already moved).
                print("No pages .txt file exists; creating them now.")
                write_pages_files()
                os.rename(name, "previous_" + name)
    # Fresh downloads for the comparison against the rotated snapshots.
    write_pages_files()


def write_pages_files():
    """Download every linked page and save its attachment rows to a snapshot."""
    with open("links.txt", encoding='utf-8') as links:
        for page in links:
            site_page = requests.get(root_url + page.strip())
            soup = BeautifulSoup(site_page.text, "html.parser")
            souped_up = soup.find_all('div', class_="sites-attachments-row")
            with open(_page_filename(page), mode='wt', encoding='utf-8') as out_file:
                out_file.write(str(souped_up))


def check_pages():
    """Return the URLs of pages whose content changed since the previous run."""
    # The auto-generated sitemap changes whenever anything changes, so it
    # is excluded from the report.
    sitemap = "/site/csc110winter2015/system/app/pages/sitemap/hierarchy"
    changed_pages = []  # original read `changed_pages =` — a syntax error
    with open("links.txt", encoding='utf-8') as links:
        for page in links:
            page = page.strip()
            if page == sitemap:
                continue
            name = page.replace("/", ".") + ".txt"
            if not filecmp.cmp("previous_" + name, name):
                changed_pages.append(root_url + page)
    return changed_pages


def send_mail():
    """Email the list of changed pages via Gmail; do nothing if none changed."""
    changed_pages = check_pages()  # fetch once — the original called it twice
    if not changed_pages:
        return  # don't open an SMTP connection just to discard it
    server = smtplib.SMTP('smtp.gmail.com', 587)
    try:
        # Say ehlo to my lil' friend!
        server.ehlo()
        # Start Transport Layer Security, required by Gmail on port 587
        server.starttls()
        server.ehlo()
        server.login("Sending Email", "Password")
        fromaddr = "Sending Email"
        toaddr = "Receiving Email"
        msg = MIMEMultipart()
        msg['From'] = fromaddr
        msg['To'] = toaddr
        msg['Subject'] = "Incoming CSC110 website changes!"
        # One URL per line reads better than str() of the whole list.
        body = "The following page(s) have been updated:\n\n" + "\n".join(changed_pages)
        msg.attach(MIMEText(body, 'plain'))
        server.sendmail(fromaddr, toaddr, msg.as_string())
    finally:
        server.quit()  # the original never closed the SMTP connection


def main():
    """Run one full check: rotate snapshots, report link diffs, email changes."""
    try_read_links_file()
    try_read_pages_files()
    check_links()
    send_mail()  # send_mail() calls check_pages() itself


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing) doesn't hit the
    # network; running the script from cron behaves exactly as before.
    main()