Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1588599
  • 博文数量: 399
  • 博客积分: 8508
  • 博客等级: 中将
  • 技术积分: 5302
  • 用 户 组: 普通用户
  • 注册时间: 2009-10-14 09:28
个人简介

能力强的人善于解决问题,有智慧的人善于绕过问题。 区别很微妙,小心谨慎做后者。

文章分类

全部博文(399)

文章存档

2018年(3)

2017年(1)

2016年(1)

2015年(69)

2013年(14)

2012年(17)

2011年(12)

2010年(189)

2009年(93)

分类: 架构设计与优化

2015-04-21 10:19:38

美国的大学都非常忌讳将学校的数据开放给外部公司,所以从学校网页抓取数据便成了惟一的选择,基于 Python / WebDriver / XPath 实现,主要是模式的匹配。以后有新写的会逐渐加入。

'''Austin Community College District'''
import re
import sys
from packages.myedu import out
from packages.myedu.schedule.script import Script

class Scraper(Script):
    """Schedule scraper for Austin Community College District (school 208).

    The department index page is walked once to cache every department
    page (``create_cache``); each cached page is then parsed into
    ``out.Section`` objects (``scrape_cached_page``).

    NOTE(review): ``self.URL``, ``self.page``, ``self.sem_year``,
    ``self.sem_name``, ``self.sectionList``, ``cache_page``,
    ``wait_for_page_load``, ``get_lxml_of_page`` and
    ``create_new_section_with_history`` are not defined in this class;
    they are presumably provided by ``Script`` -- confirm there.
    """

    # Splits a heading such as
    # "ACCT 2301 Principles of Accounting I - Financial" into
    # (department abbreviation, course number, course name).
    _COURSE_HEADING_RE = re.compile(r'(\D+)(\d+)(.*)')

    def __init__(self, **kwds):
        """Configure the school and build the semester schedule URL.

        Args:
            **kwds: Forwarded unchanged to ``Script.__init__``.

        Raises:
            KeyError: If ``self.sem_name`` is not 'Fall', 'Spring' or
                'Summer'.
        """
        super(Scraper, self).__init__(
            school_id="208",
            school_name="Austin Community College District",
            url="",
            **kwds)

        # Semester name -> letter used inside the ``term`` query parameter.
        sem_codes = {'Fall': 'F', 'Spring': 'S', 'Summer': 'U'}

        # Overrides for the department name when a course abbreviation
        # listed on a department page belongs to another department
        # (cross-department courses).
        self.dept_abre_mapping = {
            # 2015 Spring
            '': '',

            # 2015 Fall
            'POFT': 'Office Administration',
            'EDUC': 'Education Instruction',
            'BMGT': 'Management',
            'RSTO': 'Culinary Arts',
            'ITSW': 'Computer Information Technology',

            # 2015 Summer
            'ENGL': 'English',
            'COMM': 'Journalism',
            'PSYC': 'Psychology',
            'DRAM': 'Drama',
            'GAME': 'Computer Information Technology',
            'PHIL': 'Philosophy',
            'BUSG': 'Finance'
        }

        # Remember the bare URL before the query string is appended.
        self.page.baseURL = self.URL[:]

        # The term code is the year with its second character removed
        # (e.g. '2015' -> '215'), followed by the semester letter and '000'.
        term_year = self.sem_year[:1] + self.sem_year[2:]
        self.URL += '?op=browse&opclass=ViewSched&term={0}{1}000&reporting_year={2}'\
            .format(term_year, sem_codes[self.sem_name], self.sem_year)

    def create_cache(self):
        """Visit every department link on the index page and cache it.

        A ``ValueError`` raised while caching one department is reported
        on stderr and does not stop the remaining departments.
        """
        self.page.nav(self.URL, True)
        HTML = self.get_lxml_of_page()
        matching_departments = self.get_matching_departments(HTML)

        # Single-argument form: prints the same text on Python 2 and 3.
        print('Looping departments\n')

        # Collect every href before navigating anywhere; presumably the
        # element handles stop being usable once the browser leaves the
        # index page.
        department_urls = [dept.get_attribute('href')
                           for dept in matching_departments]
        total = float(len(department_urls))

        for position, url in enumerate(department_urls, 1):
            # Print progress on a single, continuously rewritten line.
            sys.stdout.write('\r{0:.0f}%'.format(position / total * 100))
            sys.stdout.flush()

            # Navigate straight to the course page since its URL is known.
            self.page.nav(url, True)
            self.wait_for_page_load()

            try:
                self.cache_page()
            except ValueError as error:
                # Best effort: keep going, but leave a trace of what was
                # skipped instead of failing silently.
                sys.stderr.write(
                    '\nSkipped {0}: {1}\n'.format(url, error))

    def get_matching_departments(self, HTML):
        """Return the department link elements found on the current page.

        Args:
            HTML: Parsed page; unused here, the lookup goes through
                ``self.page.xfind``.
        """
        return self.page.xfind('//ul/li//a')

    def check_sem_availability(self):
        """Report the semester as available unconditionally."""
        return True

    def scrape_cached_page(self, HTML):
        """Parse one cached department page into sections.

        Args:
            HTML: lxml tree of the cached page.

        Pages without a department heading after the ``class_legend``
        block are ignored.
        """
        # Department name
        heading_nodes = HTML.xpath(
            "//div[@id='class_legend']/following-sibling::h3[1]")
        if not heading_nodes:
            return

        full_dept_name = heading_nodes[0].text_content().strip()

        self.scrape_classroom_sections(full_dept_name, HTML)
        self.scrape_online_distance_learning_sections(full_dept_name, HTML)

    def get_department_name(self, dept_abbrev, full_dept_name):
        """Return the department name to record for a course.

        Args:
            dept_abbrev: Abbreviation parsed from the course heading.
            full_dept_name: Name of the department page being scraped.

        Returns:
            The non-empty override from ``self.dept_abre_mapping`` when
            one exists for ``dept_abbrev``, otherwise ``full_dept_name``.
        """
        return self.dept_abre_mapping.get(dept_abbrev) or full_dept_name

    def scrape_online_distance_learning_sections(self, full_dept_name, HTML):
        """Scrape the online / distance-learning tables (13 columns).

        Args:
            full_dept_name: Name of the department page being scraped.
            HTML: lxml tree of the cached page.
        """
        self._scrape_section_tables(
            full_dept_name, HTML, 13, self._fill_online_row)

    def scrape_classroom_sections(self, full_dept_name, HTML):
        """Scrape the classroom tables (16 columns).

        Args:
            full_dept_name: Name of the department page being scraped.
            HTML: lxml tree of the cached page.
        """
        self._scrape_section_tables(
            full_dept_name, HTML, 16, self._fill_classroom_row)

    @staticmethod
    def _cell(columns, index):
        """Return the stripped text of the table cell at ``index``."""
        return columns[index].text_content().strip()

    def _parse_course_heading(self, table):
        """Parse the course heading that precedes a section table.

        Returns:
            ``(dept_abbrev, course_number, course_name)``, or ``None``
            when no heading is found or it does not match the expected
            "ABBR 1234 Name" layout.
        """
        links = table.xpath(
            "preceding::h4[contains(a/@title, 'Course Description')][1]/a")
        if not links:
            return None

        match = self._COURSE_HEADING_RE.search(links[0].text_content())
        if match is None:
            return None

        return tuple(part.strip() for part in match.groups())

    def _scrape_section_tables(self, full_dept_name, HTML, column_count,
                               fill_row):
        """Turn every ``section_line`` table of one layout into sections.

        Each table holds a single row.  A row carrying a synonym number
        starts a new section; a row without one adds a further meeting
        to the section currently being built.

        Args:
            full_dept_name: Name of the department page being scraped.
            HTML: lxml tree of the cached page.
            column_count: Number of cells identifying the table layout.
            fill_row: Callable ``(section, columns, synonym_number)``
                that copies the layout-specific cells onto the section.
        """
        tables = HTML.xpath(
            "//table[@class='section_line'][count(tbody/tr/td) = %d]"
            % column_count)
        if not tables:
            return

        # NOTE: the nearest preceding <p class="teach_term"> carries the
        # session dates (e.g. "6 Week Session: July 8 - August 16");
        # they are not recorded on the section.

        section = out.Section()
        started = False

        for table in tables:
            heading = self._parse_course_heading(table)
            if heading is None:
                # Without a course heading the row cannot be attributed
                # to any course, so skip it rather than abort the page.
                continue
            dept_abbrev, course_number, course_name = heading

            # Each table only has one row.
            columns = table.xpath("tbody/tr")[0].xpath("td")

            # Synonym number
            synonym_number = self._cell(columns, 4)

            if not started:
                started = True
            elif synonym_number:
                # A new section begins: store the finished one first.
                self.sectionList.appendSection(section)
                section = self.create_new_section_with_history(section)

            section.deptAbbrev = dept_abbrev
            section.deptName = self.get_department_name(
                dept_abbrev, full_dept_name)
            section.courseNumber = course_number
            section.courseName = course_name

            fill_row(section, columns, synonym_number)

        # Sections are otherwise only stored when the next one begins,
        # so the final section has to be flushed explicitly here.
        if started:
            self.sectionList.appendSection(section)

    def _fill_online_row(self, section, columns, synonym_number):
        """Copy the cells of one 13-column (online) row onto ``section``."""
        if synonym_number:
            section.uniqueNumber = synonym_number
            section.sectionNumber = self._cell(columns, 6)

            # Campus Code
            if 'ONL' in self._cell(columns, 7):
                section.meetingTypes = ['online']
        else:
            # Building Code
            building = self._cell(columns, 7)

            # Room
            room = self._cell(columns, 8)

            if building and room:
                # The campus is recorded as the building code minus its
                # last character.
                section.campuses.append(building[:-1])
                section.rooms.append(building + '-' + room)

            # Meeting Days
            section.days.append(self._cell(columns, 9))

            # Meeting Time
            section.times.append(self._cell(columns, 10))

        # Instructor
        section.profs.append(self._cell(columns, 12))

    def _fill_classroom_row(self, section, columns, synonym_number):
        """Copy the cells of one 16-column (classroom) row onto ``section``."""
        if synonym_number:
            section.uniqueNumber = synonym_number
            section.sectionNumber = self._cell(columns, 6)
            section.meetingTypes = ['in-class']

        # Campus Code
        section.campuses.append(self._cell(columns, 7))

        # Building Code
        building = self._cell(columns, 8)

        # Room
        room = self._cell(columns, 9)

        if building and room:
            section.rooms.append(building + '-' + room)

        # Meeting Days
        section.days.append(self._cell(columns, 10))

        # Meeting Time
        section.times.append(self._cell(columns, 11))

        # Instructor
        section.profs.append(self._cell(columns, 15))
阅读(1123) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~