美国的大学都非常忌讳将学校的数据开放给外部公司,所以从学校网页抓取数据便成了唯一的选择。下面的脚本基于 Python / WebDriver / XPath 实现,主要是模式的匹配。以后有新写的会逐渐加入。
'''Austin Community College District'''
import re
import sys
from packages.myedu import out
from packages.myedu.schedule.script import Script
class Scraper(Script):
    """Schedule scraper for Austin Community College District (school 208).

    The schedule site lists one link per department; every department page
    is cached by ``create_cache`` and later parsed by ``scrape_cached_page``.
    A department page holds one ``table.section_line`` per schedule row, in
    two layouts that are told apart by their total cell count: 16 cells for
    classroom sections and 13 cells for online / distance-learning sections.

    NOTE(review): ``self.URL``, ``self.page``, ``self.sem_year``,
    ``self.sem_name``, ``self.sectionList`` and the ``cache_page`` /
    ``get_lxml_of_page`` / ``wait_for_page_load`` /
    ``create_new_section_with_history`` helpers are presumably provided by
    ``Script`` -- they are not defined in this class.
    """

    # Semester name -> letter used inside the ``term`` query parameter.
    _SEM_CODES = {'Fall': 'F', 'Spring': 'S', 'Summer': 'U'}

    # Splits a heading such as
    # "ACCT 2301 Principles of Accounting I - Financial" into the department
    # abbreviation, the course number and the course title.
    _COURSE_HEADING_RE = re.compile(r'(\D+)(\d+)(.*)')

    def __init__(self, **kwds):
        """Set up the school identity and build the semester schedule URL.

        Args:
            **kwds: Passed through unchanged to ``Script.__init__``.

        Raises:
            KeyError: If ``self.sem_name`` is not Fall, Spring or Summer.
        """
        super(Scraper, self).__init__(
            school_id="208",
            school_name="Austin Community College District",
            url="",
            **kwds)
        # Department names for abbreviations that show up on another
        # department's page (cross-listed courses); consulted by
        # get_department_name before falling back to the page heading.
        self.dept_abre_mapping = {
            # 2015 Spring
            '': '',
            # 2015 Fall
            'POFT': 'Office Administration',
            'EDUC': 'Education Instruction',
            'BMGT': 'Management',
            'RSTO': 'Culinary Arts',
            'ITSW': 'Computer Information Technology',
            # 2015 Summer
            'ENGL': 'English',
            'COMM': 'Journalism',
            'PSYC': 'Psychology',
            'DRAM': 'Drama',
            'GAME': 'Computer Information Technology',
            'PHIL': 'Philosophy',
            'BUSG': 'Finance',
        }
        # Remember the bare URL before the query string is appended.
        self.page.baseURL = self.URL[:]
        # The term is the year without its second character, the semester
        # letter and "000"; e.g. a sem_year of '2015' in Fall gives 215F000.
        term_year = self.sem_year[:1] + self.sem_year[2:]
        self.URL += '?op=browse&opclass=ViewSched&term={0}{1}000&reporting_year={2}'\
            .format(term_year, self._SEM_CODES[self.sem_name], self.sem_year)

    def create_cache(self):
        """Visit every department page of the semester and cache it.

        The department links are collected up front because navigating away
        from the index page would invalidate the link elements. A department
        whose caching raises ``ValueError`` is reported on stderr and
        skipped so that the remaining departments are still processed.
        """
        self.page.nav(self.URL, True)
        HTML = self.get_lxml_of_page()
        matching_departments = self.get_matching_departments(HTML)
        print('Looping departments\n')
        department_urls = [dept.get_attribute('href')
                           for dept in matching_departments]
        total = len(department_urls)
        for position, dept_url in enumerate(department_urls, 1):
            # Print progress on a single, continuously rewritten line.
            sys.stdout.write('\r{0:.0f}%'.format(position / float(total) * 100))
            sys.stdout.flush()
            # Navigate to course page since URL is already known
            self.page.nav(dept_url, True)
            self.wait_for_page_load()
            try:
                self.cache_page()
            except ValueError as error:
                sys.stderr.write(
                    '\nSkipping {0}: {1}\n'.format(dept_url, error))

    def get_matching_departments(self, HTML):
        """Return the department link elements of the current page.

        Args:
            HTML: Unused; the lookup goes through ``self.page.xfind``.
        """
        return self.page.xfind('//ul/li//a')

    def check_sem_availability(self):
        """Report the semester as available; no check is performed."""
        return True

    def scrape_cached_page(self, HTML):
        """Extract all sections from one cached department page.

        Args:
            HTML: lxml tree of the department page.

        Pages without a department heading after the ``class_legend`` block
        are ignored.
        """
        # Department name
        full_dept_name_node = HTML.xpath(
            "//div[@id='class_legend']/following-sibling::h3[1]")
        if not full_dept_name_node:
            return
        full_dept_name = full_dept_name_node[0].text_content().strip()
        self.scrape_classroom_sections(full_dept_name, HTML)
        self.scrape_online_distance_learning_sections(full_dept_name, HTML)

    def get_department_name(self, dept_abbrev, full_dept_name):
        """Return the department name to store for ``dept_abbrev``.

        Args:
            dept_abbrev: Abbreviation parsed from the course heading.
            full_dept_name: Heading of the page being scraped.

        Returns:
            The mapped name when ``dept_abre_mapping`` has a non-empty entry
            for the abbreviation, otherwise ``full_dept_name``.
        """
        return self.dept_abre_mapping.get(dept_abbrev) or full_dept_name

    def scrape_online_distance_learning_sections(self, full_dept_name, HTML):
        """Collect the online / distance-learning sections of a page.

        Args:
            full_dept_name: Department heading of the page.
            HTML: lxml tree of the department page.
        """
        # Online distant courses, the column count of which is 13
        self._scrape_section_tables(
            full_dept_name, HTML, 13, self._record_online_meeting)

    def scrape_classroom_sections(self, full_dept_name, HTML):
        """Collect the classroom sections of a page.

        Args:
            full_dept_name: Department heading of the page.
            HTML: lxml tree of the department page.
        """
        # The column count of which is 16
        self._scrape_section_tables(
            full_dept_name, HTML, 16, self._record_classroom_meeting)

    @staticmethod
    def _cell_text(columns, index):
        """Return the stripped text of the cell at ``index``."""
        return columns[index].text_content().strip()

    def _parse_course_heading(self, course):
        """Return (dept_abbrev, course_number, course_name) for a table.

        The values come from the nearest preceding "Course Description"
        heading, e.g. "ACCT 2301 Principles of Accounting I - Financial".
        """
        heading = course.xpath(
            "preceding::h4[contains(a/@title, 'Course Description')][1]/a")
        match = self._COURSE_HEADING_RE.search(heading[0].text_content())
        return tuple(part.strip() for part in match.groups())

    def _scrape_section_tables(self, full_dept_name, HTML, cell_count,
                               record_meeting):
        """Walk the section tables of one layout and emit their sections.

        Args:
            full_dept_name: Department heading of the page.
            HTML: lxml tree of the department page.
            cell_count: Total number of cells identifying the table layout.
            record_meeting: Callable ``(section, columns, starts_section)``
                that stores the layout-specific meeting data.

        A table with a synonym number starts a new section; a table without
        one adds another meeting to the section currently being built.
        """
        courses = HTML.xpath(
            "//table[@class='section_line'][count(tbody/tr/td) = {0}]"
            .format(cell_count))
        if not courses:
            return
        section = out.Section()
        first_table = True
        for course in courses:
            dept_abbrev, course_number, course_name = \
                self._parse_course_heading(course)
            # Each table only has one row
            columns = course.xpath("tbody/tr")[0].xpath("td")
            # Synonym number
            synonym_number = self._cell_text(columns, 4)
            if first_table:
                first_table = False
            elif synonym_number:
                # The previous section is complete once the next one starts.
                self.sectionList.appendSection(section)
                section = self.create_new_section_with_history(section)
            section.deptAbbrev = dept_abbrev
            section.deptName = self.get_department_name(
                dept_abbrev, full_dept_name)
            section.courseNumber = course_number
            section.courseName = course_name
            if synonym_number:
                section.uniqueNumber = synonym_number
                section.sectionNumber = self._cell_text(columns, 6)
            record_meeting(section, columns, bool(synonym_number))
        # Nothing follows the last section to trigger the append above, so
        # flush it here; otherwise it would be lost.
        self.sectionList.appendSection(section)

    def _record_online_meeting(self, section, columns, starts_section):
        """Store the meeting data of one 13-cell (online layout) row."""
        # Column 7 carries the building code in this layout; a value
        # containing "ONL" marks an online meeting instead of a location.
        building = self._cell_text(columns, 7)
        if 'ONL' in building:
            section.meetingTypes = ['online']
        else:
            # Room
            room = self._cell_text(columns, 8)
            if building and room:
                # The campus is the building code minus its last character.
                section.campuses.append(building[:-1])
                section.rooms.append(building + '-' + room)
        # Meeting Days
        section.days.append(self._cell_text(columns, 9))
        # Meeting Time
        section.times.append(self._cell_text(columns, 10))
        # Instructor, Link to the instructor's site if available
        section.profs.append(self._cell_text(columns, 12))

    def _record_classroom_meeting(self, section, columns, starts_section):
        """Store the meeting data of one 16-cell (classroom layout) row."""
        if starts_section:
            section.meetingTypes = ['in-class']
        # Campus Code
        section.campuses.append(self._cell_text(columns, 7))
        # Building Code
        building = self._cell_text(columns, 8)
        # Room
        room = self._cell_text(columns, 9)
        if building and room:
            section.rooms.append(building + '-' + room)
        # Meeting Days
        section.days.append(self._cell_text(columns, 10))
        # Meeting Time
        section.times.append(self._cell_text(columns, 11))
        # Instructor, Link to the instructor's site if available
        section.profs.append(self._cell_text(columns, 15))
阅读(1116) | 评论(0) | 转发(0) |