'''
University of Texas at Dallas
'''
import re
import sys
from packages.myedu import out
from packages.myedu.schedule.script import Script
class Scraper(Script):
def __init__(self, **kwds):
super(Scraper, self).__init__(
school_id="114",
school_name="University of Texas at Dallas",
url="%s/",
**kwds)
# For building
sem_codes = {'Fall': 'f', 'Spring': 's', 'Summer': 'u'}
self.page.baseURL = "guidedsearch"
self.URL += 'term_%s%s' % (self.sem_year[2:], sem_codes[self.sem_name])
# For scraping
self.cached_dept_by_name_map = {}
def create_cache(self):
self.page.nav(self.page.baseURL, True)
HTML = self.get_lxml_of_page()
matching_departments = self.get_matching_departments(HTML)
print 'Looping departments\n'
dept_abre_mapping = {}
for dept in matching_departments:
option = str(dept)
dept_all = re.search(r'<.*>(.*)-(.*)<.*>', option)
dept_abre_mapping[dept_all.group(2).strip()] = dept_all.group(1)[:4].strip()
# Save the mapping for scrape late use
super(Scraper, self).create_dept_list_file(HTML, dept_abre_mapping)
####################################################################################
# Navigate to all course page since URL is already known
dept_count = 1
for dept in dept_abre_mapping.values():
dept_page = self.URL % dept
# Print progress
sys.stdout.write('\r{0:.0f}%'.format(dept_count / float(len(matching_departments)) * 100))
dept_count += 1
self.page.nav(dept_page, True)
self.wait_for_page_load()
try:
self.cache_page()
except ValueError:
pass
def get_matching_departments(self, HTML):
return self.page.xfind("//select[@id='combobox_cp']/option[@value!='+' and @value!='++']")
def check_sem_availability(self):
return True
def scrape_cached_page(self, HTML):
# create a reverse map of dept_name --> dept_abbrv
dept_items = self.dept_list.items()
for k, v in dept_items:
self.cached_dept_by_name_map[v.upper()] = k
section = out.Section()
# The column count of which is 16
courses = HTML.xpath("//div[@class='section-list']/table/tbody/tr[count(td) = 7]")
if not courses:
return
fist_section = True
for course in courses:
columns = course.xpath("td")
if fist_section:
fist_section = False
else:
self.sectionList.appendSection(section)
section = self.create_new_section_with_history(section)
#Column 1
class_section = columns[1].xpath("a")[0].text_content().split()
unique_number = columns[1].xpath("br[1]/following-sibling::text()")[0].strip()
section.courseNumber = class_section[1].split('.')[0]
section.sectionNumber = class_section[1].split('.')[1]
section.uniqueNumber = unique_number
section.deptAbbrev = class_section[0].strip()
section.deptName = self.cached_dept_by_name_map[section.deptAbbrev]
# Column 2
class_credit = re.search(r'(.*)\((.*)\)', columns[2].text_content().strip())
class_name = class_credit.group(1).strip()
credit_hours = class_credit.group(2).strip()
section.courseName = class_name
if "Credits" in credit_hours:
section.creditHours = re.search(r'\s*(\d+)\s*Credits', credit_hours).group(1).strip()
# Column 3
section.profs.append(columns[3].text_content().strip())
# Column 4
if 'Online' in columns[4].text_content().strip():
section.meetingTypes = ['online']
elif 'not' in columns[4].text_content().strip():
section.rooms.append('TBA')
section.days.append('TBA')
section.times.append('TBA')
else:
section.meetingTypes = ['in-class']
schedules = columns[4].xpath("./br[position() mod 3 = 1]")
for schedule in schedules:
anchor = schedule.xpath("./following-sibling::a[1][text()]")
if anchor:
location = anchor[0].text_content()
else:
location = schedule.xpath("./following-sibling::text()[1]")
time_info = schedule.xpath("./preceding-sibling::text()[1]")
schedule_info = re.search(r'([^:]+):(.*)', time_info[0].strip())
days = schedule_info.group(1).strip().replace("&", "").replace(" ", "")
times = schedule_info.group(2).strip()
section.days.append(days)
section.times.append(times)
if 'TBD' not in location:
section.rooms.append(location)
else:
section.rooms.append("TBA")