Category: Architecture Design and Optimization

Posted: 2015-04-21 10:24:16

'''University of Texas at Dallas'''
import re
import sys
from packages.myedu import out
from packages.myedu.schedule.script import Script

class Scraper(Script):

    def __init__(self, **kwds):
        super(Scraper, self).__init__(
            school_id="114",
            school_name="University of Texas at Dallas",
            url="%s/",
            **kwds)

        # For building the semester-specific search URL

        sem_codes = {'Fall': 'f', 'Spring': 's', 'Summer': 'u'}

        self.page.baseURL = "guidedsearch"

        self.URL += 'term_%s%s' % (self.sem_year[2:], sem_codes[self.sem_name])
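        # e.g. sem_year == '2015' and sem_name == 'Fall' yield the suffix 'term_15f'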

        # For scraping
        self.cached_dept_by_name_map = {}

    def create_cache(self):

        self.page.nav(self.page.baseURL, True)

        HTML = self.get_lxml_of_page()

        matching_departments = self.get_matching_departments(HTML)

        print 'Looping departments\n'

        dept_abre_mapping = {}

        for dept in matching_departments:
            option = str(dept)
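            # Assumed option markup (hypothetical example):
            #   <option value="acct">ACCT - Accounting</option>
            # group(1) holds the abbreviation (truncated to 4 chars below),
            # group(2) holds the full department name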
            dept_all = re.search(r'<.*>(.*)-(.*)<.*>', option)
            dept_abre_mapping[dept_all.group(2).strip()] = dept_all.group(1)[:4].strip()

        # Save the mapping for later use during scraping
        super(Scraper, self).create_dept_list_file(HTML, dept_abre_mapping)

        ####################################################################################

        # Navigate to each department's course page; the URL pattern is already known

        dept_count = 1

        for dept in dept_abre_mapping.values():

            dept_page = self.URL % dept

            # Print progress
            sys.stdout.write('\r{0:.0f}%'.format(dept_count / float(len(matching_departments)) * 100))
            dept_count += 1

            self.page.nav(dept_page, True)

            self.wait_for_page_load()

            try:
                self.cache_page()
            except ValueError:
                pass

    def get_matching_departments(self, HTML):
        # All entries in the department combobox except the '+'/'++' placeholders
        return self.page.xfind("//select[@id='combobox_cp']/option[@value!='+' and @value!='++']")

    def check_sem_availability(self):
        # Semester availability is not verified for this school
        return True

    def scrape_cached_page(self, HTML):

        # create a reverse map of dept_name --> dept_abbrv
        dept_items = self.dept_list.items()
        for k, v in dept_items:
            self.cached_dept_by_name_map[v.upper()] = k

        section = out.Section()

        # Select only the rows that have exactly 7 columns (the section rows)
        courses = HTML.xpath("//div[@class='section-list']/table/tbody/tr[count(td) = 7]")

        if not courses:
            return

        first_section = True

        for course in courses:

            columns = course.xpath("td")

            if first_section:
                first_section = False
            else:
                # Flush the previous section before starting a new one
                self.sectionList.appendSection(section)
                section = self.create_new_section_with_history(section)

            # Column 1
            class_section = columns[1].xpath("a")[0].text_content().split()
            unique_number = columns[1].xpath("br[1]/following-sibling::text()")[0].strip()
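            # Hypothetical sample: class_section == ['ACCT', '2301.001'],
            # i.e. course number '2301' and section number '001'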

            section.courseNumber = class_section[1].split('.')[0]
            section.sectionNumber = class_section[1].split('.')[1]
            section.uniqueNumber = unique_number

            section.deptAbbrev = class_section[0].strip()
            section.deptName = self.cached_dept_by_name_map[section.deptAbbrev]

            # Column 2
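            # Hypothetical cell text: "Financial Accounting (3 Credits)";
            # group(1) is the course name, group(2) the credit text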
            class_credit = re.search(r'(.*)\((.*)\)', columns[2].text_content().strip())
            class_name = class_credit.group(1).strip()
            credit_hours = class_credit.group(2).strip()
            section.courseName = class_name

            if "Credits" in credit_hours:
                section.creditHours = re.search(r'\s*(\d+)\s*Credits', credit_hours).group(1).strip()

            # Column 3
            section.profs.append(columns[3].text_content().strip())

            # Column 4
            if 'Online' in columns[4].text_content().strip():
                section.meetingTypes = ['online']

            elif 'not' in columns[4].text_content().strip():
                # The cell indicates the schedule is not yet available
                section.rooms.append('TBA')
                section.days.append('TBA')
                section.times.append('TBA')

            else:
                section.meetingTypes = ['in-class']

                # Each meeting entry occupies three <br>-separated lines in the
                # cell; anchor on the first <br> of each triple
                schedules = columns[4].xpath("./br[position() mod 3 = 1]")

                for schedule in schedules:

                    anchor = schedule.xpath("./following-sibling::a[1][text()]")

                    if anchor:
                        location = anchor[0].text_content()
                    else:
                        # xpath returns a list of text nodes; take the first one
                        # (the original appended the whole list to section.rooms)
                        location = schedule.xpath("./following-sibling::text()[1]")[0]

                    time_info = schedule.xpath("./preceding-sibling::text()[1]")

                    # Assumed cell text (hypothetical): "M & W: 10:00am - 10:50am";
                    # days precede the colon, the time range follows it
                    schedule_info = re.search(r'([^:]+):(.*)', time_info[0].strip())

                    days = schedule_info.group(1).strip().replace("&", "").replace(" ", "")
                    times = schedule_info.group(2).strip()

                    section.days.append(days)
                    section.times.append(times)

                    if 'TBD' not in location:
                        section.rooms.append(location)
                    else:
                        section.rooms.append("TBA")