From 4a96c69ea0e446f0f22482233806e94276c83582 Mon Sep 17 00:00:00 2001 From: Roger Barnes Date: Mon, 16 Aug 2010 23:30:47 +1000 Subject: [PATCH] Created Willoughby parser and moved common epathway code from wollongong into superclass --- scraper_factory.rb | 1 + scrapers/epathway_scraper.rb | 62 ++++++++++++++++++++++++++++++++++++++ scrapers/willoughby_scraper.rb | 61 +++++++++++++++++++++++++++++++++++++ scrapers/wollongong_scraper.rb | 65 +++++++++------------------------------ 4 files changed, 139 insertions(+), 50 deletions(-) create mode 100644 scrapers/epathway_scraper.rb create mode 100644 scrapers/willoughby_scraper.rb diff --git a/scraper_factory.rb b/scraper_factory.rb index 87c776b..08ecf5d 100644 --- a/scraper_factory.rb +++ b/scraper_factory.rb @@ -76,6 +76,7 @@ module Scrapers SPEARScraper.new("Melbourne City Council (SPEAR)", "Melbourne", "VIC", "Melbourne City Council"), MelbourneScraper.new("Melbourne City Council", "Melbourne (City)", "VIC"), WollongongScraper.new("Wollongong City Council", "Wollongong", "NSW"), + WilloughbyScraper.new("Willoughby City Council", "Willoughby", "NSW"), MarrickvilleScraper.new("Marrickville Council", "Marrickville", "NSW"), CGIScraper.new("Department of Planning and Local Government", "EDALA", "SA", "php-cgi -d short_open_tag=0 -d cgi.force_redirect=0 -f", "edala.php"), KogarahScraper.new("Kogarah City Council", "Kogarah", "NSW"), diff --git a/scrapers/epathway_scraper.rb b/scrapers/epathway_scraper.rb new file mode 100644 index 0000000..a6903ac --- /dev/null +++ b/scrapers/epathway_scraper.rb @@ -0,0 +1,62 @@ +require 'scraper' + +# This is using the ePathway system. 
+ +class EpathwayScraper < Scraper + def extract_urls_from_page(page) + content = page.at('table.ContentPanel') + if content + content.search('tr')[1..-1].map do |app| + extract_relative_url(app.search('td')[0]) + end + else + [] + end + end + + def extract_field(field, label) + raise "unexpected form" unless field.search('td')[0].inner_text == label + field.search('td')[1].inner_text.strip + end + + def choose_search_type(option_index) + page = agent.get(enquiry_url) + form = page.forms.first + form.radiobuttons[option_index].click + page = form.submit(form.button_with(:name => /Continue/)) + page.forms.first + end + + def enter_date_range(date, form) + # Going to enter a date range + form.radiobutton_with(:value => /DateRange/).click + formatted_date = "#{date.day}/#{date.month}/#{date.year}" + form.field_with(:name => /DateFrom/).value = formatted_date + form.field_with(:name => /DateTo/).value = formatted_date + + form.submit(form.button_with(:name => /Search/)) + end + + def parse_search_results(page) + page_label = page.at('span#ctl00_MainBodyContent_mPageNumberLabel') + if page_label.nil? + # If we can't find the label assume there is only one page of results + number_of_pages = 1 + elsif page_label.inner_text =~ /Page \d+ of (\d+)/ + number_of_pages = $~[1].to_i + else + raise "Unexpected form for number of pages" + end + urls = [] + (1..number_of_pages).each do |page_no| + # Don't refetch the first page + if page_no > 1 + page = agent.get("https://#{enquiry_domain}/ePathway/Production/Web/GeneralEnquiry/EnquirySummaryView.aspx?PageNumber=#{page_no}") + end + # Get a list of urls on this page + urls += extract_urls_from_page(page) + end + urls + end + +end diff --git a/scrapers/willoughby_scraper.rb b/scrapers/willoughby_scraper.rb new file mode 100644 index 0000000..020ea3b --- /dev/null +++ b/scrapers/willoughby_scraper.rb @@ -0,0 +1,61 @@ +require 'scraper' + +# This is using the ePathway system. 
+ +class WilloughbyScraper < EpathwayScraper + # The domain + def enquiry_domain + "epathway.willoughby.nsw.gov.au" + end + + # The main url for the planning system which can be reached directly without getting a stupid session timed out error + def enquiry_url + "https://#{enquiry_domain}/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx" + end + + # Returns a list of URLs for all the applications submitted on the given date + def urls(date) + # Get through the "general enquiry" screen + form = choose_search_type(0) + + # Dodge the javascript ASP form submission to jump to the date based search tab + form.add_field!("__EVENTTARGET", "ctl00$MainBodyContent$mGeneralEnquirySearchControl$mSearchTabStrip") + form.add_field!("__EVENTARGUMENT", "2") + form.field_with(:name => /mSearchTabStrip_State/).value = "2" + page = form.submit + form = page.forms.first + + # Going to enter a date range + page = enter_date_range(date, form) + + urls = parse_search_results(page) + urls + end + + def applications(date) + urls = urls(date) + urls.map do |url| + page = agent.get(url) + table = page.search('table#ctl00_MainBodyContent_DynamicTable > tr')[0].search('td')[0].search('table')[2] + + date_received = extract_field(table.search('tr')[1], "Lodgement Date") + #puts "date received: #{date_received}" + + application_id = extract_field(table.search('tr')[0], "Number") + #puts "application id: #{application_id}" + + description = simplify_whitespace(extract_field(table.search('tr')[2], "Description")) + #puts "description: #{description}" + addresses = [simplify_whitespace(extract_field(table.search('tr')[3], "Location"))] + #puts "addresses: #{addresses[0]}" + DevelopmentApplication.new( + :date_received => date_received, + :application_id => application_id, + :description => description, + :addresses => addresses, + :info_url => enquiry_url, + :comment_url => enquiry_url) + end + end + +end diff --git a/scrapers/wollongong_scraper.rb b/scrapers/wollongong_scraper.rb index 
8a463ed..5bc16a3 100644 --- a/scrapers/wollongong_scraper.rb +++ b/scrapers/wollongong_scraper.rb @@ -2,66 +2,29 @@ require 'scraper' # This is using the ePathway system. -class WollongongScraper < Scraper - def extract_urls_from_page(page) - content = page.at('table.ContentPanel') - if content - content.search('tr')[1..-1].map do |app| - extract_relative_url(app.search('td')[0]) - end - else - [] - end +class WollongongScraper < EpathwayScraper + # The domain + def enquiry_domain + "epathway.wollongong.nsw.gov.au" end # The main url for the planning system which can be reached directly without getting a stupid session timed out error def enquiry_url - "https://epathway.wollongong.nsw.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx" + "https://#{enquiry_domain}/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx" end - + # Returns a list of URLs for all the applications submitted on the given date def urls(date) - page = agent.get(enquiry_url) - form = page.forms.first - form.radiobuttons[1].click - page = form.submit(form.button_with(:name => /Continue/)) - form = page.forms.first + # Get through the "general enquiry" screen + form = choose_search_type(1) + # Going to enter a date range - form.radiobutton_with(:value => /DateRange/).click - formatted_date = "#{date.day}/#{date.month}/#{date.year}" - form.field_with(:name => /DateFrom/).value = formatted_date - form.field_with(:name => /DateTo/).value = formatted_date - - page = form.submit(form.button_with(:name => /Search/)) - #p page.parser + page = enter_date_range(date, form) - #exit - page_label = page.at('span#ctl00_MainBodyContent_mPageNumberLabel') - if page_label.nil? 
- # If we can't find the label assume there is only one page of results - number_of_pages = 1 - elsif page_label.inner_text =~ /Page \d+ of (\d+)/ - number_of_pages = $~[1].to_i - else - raise "Unexpected form for number of pages" - end - urls = [] - (1..number_of_pages).each do |page_no| - # Don't refetch the first page - if page_no > 1 - page = agent.get("https://epathway.wollongong.nsw.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquirySummaryView.aspx?PageNumber=#{page_no}") - end - # Get a list of urls on this page - urls += extract_urls_from_page(page) - end + urls = parse_search_results(page) urls end - - def extract_field(field, label) - raise "unexpected form" unless field.search('td')[0].inner_text == label - field.search('td')[1].inner_text.strip - end - + def applications(date) urls = urls(date) urls.map do |url| @@ -94,4 +57,6 @@ class WollongongScraper < Scraper :comment_url => enquiry_url) end end -end \ No newline at end of file + + +end -- 1.7.0.4