#!/opt/ruby/1.9.0/bin/ruby
#
# Downloads the poll tables published at pollingreport.com/BushJob1.htm,
# normalises the HTML with the external `tidy` program, and re-emits the
# scraped figures as a small XML document on standard output.
# Progress messages go to standard error.

require 'net/http'
require 'rexml/document'

# One survey table: its source blurb, the question asked, and its Poll rows.
Entry = Struct.new("Entry", :about, :question, :entries)

# One dated row of a survey table. Every field is kept as scraped text.
Poll = Struct.new("Poll", :date, :approve, :disapprove, :undecided, :n)

# Fetches (or reads from the local cache file), parses and converts the poll
# page. All the work happens in the constructor; the resulting XML text is
# available from #get_data.
class BushSucker
  include REXML

  POLLS_URL = "http://www.pollingreport.com/BushJob1.htm".freeze
  CACHE_FILE = "polls.xml".freeze
  TIDY_COMMAND = "tidy -asxml -nq -utf8 2>/dev/null".freeze

  # @param force [Boolean] when true, download the page again even if the
  #   cache file already exists in the current directory.
  # @raise [RuntimeError] if the page had to be downloaded and `tidy`
  #   returned no output.
  def initialize(force = false)
    page = if force || !File.exist?(CACHE_FILE)
             download_page
           else
             read_cached_page
           end

    started = Time.now
    doc = Document.new(page)
    elapsed = Time.now - started
    $stderr.puts "* Parsed #{page.length / 1000}kb page in #{elapsed} seconds."

    @output = build_output(extract_entries(doc))
  end

  # @return [String] the generated XML document.
  def get_data
    @output
  end

  private

  # Downloads the page, runs it through `tidy` to obtain well-formed XHTML,
  # stores the result in the cache file and returns it.
  def download_page
    html = Net::HTTP.get(URI.parse(POLLS_URL))
    $stderr.puts "Got page"

    # NOTE(review): as written these three substitutions change nothing (two
    # empty patterns, and a space replaced by a space). The intended patterns
    # appear to have been lost -- confirm against the file's history.
    html.gsub!(//, "")
    html.gsub!(//, "")
    html.gsub!(/ /, " ")

    cleaned = IO.popen(TIDY_COMMAND, "w+") do |tidy|
      tidy.puts html
      tidy.close_write # signal end of input so tidy can finish and we can read
      tidy.read
    end
    # Do not cache an empty result: it would be reused on every later run.
    raise "tidy produced no output (is tidy installed?)" if cleaned.strip.empty?
    $stderr.puts "Cleaned page"

    File.open(CACHE_FILE, "w+") { |f| f.puts cleaned }
    $stderr.puts "Wrote cleaned page to polls.xml"
    cleaned
  end

  # Returns the previously cleaned page from the cache file, unmodified.
  def read_cached_page
    page = File.read(CACHE_FILE)
    $stderr.puts "Read polls.xml"
    page
  end

  # Locates the survey tables in the parsed page and converts each one
  # (except the first match, which is skipped) into an Entry.
  def extract_entries(doc)
    outer_td = XPath.match(doc, '/html/body/table[2]/tr/td/div/div')
    # Tables of surveys
    tables = XPath.match(outer_td, './table')
    tables += XPath.match(outer_td, './div/table')
    $stderr.puts "Got #{tables.length} tables"

    # drop(1) yields [] when nothing matched, where [1..-1] would yield nil.
    tables.drop(1).map { |table| parse_table(table) }
  end

  # Builds an Entry from one survey table: row 1 is the source blurb, row 3
  # the question, and rows 7 onwards the individual polls.
  def parse_table(table)
    $stderr.puts "New table"
    about = squeezed_text(table, 'tr[1]//text()', " ")
    $stderr.puts "Entry: #{about}"
    question = squeezed_text(table, 'tr[3]//text()', " ")
    entry = Entry.new(about, question, [])

    rows = XPath.match(table, 'tr')
    # NOTE(review): a table with exactly seven rows is skipped by this guard;
    # kept as in the original -- confirm whether that is intended.
    if rows.length > 7
      rows.drop(6).each do |row|
        entry.entries << parse_row(row)
        $stderr.print "."
        $stderr.flush
      end
    end
    $stderr.puts
    entry
  end

  # Builds a Poll from the cells of one table row.
  def parse_row(row)
    date = XPath.match(row, 'td[2]//text()').join.tr("\n ", "")
    approve = squeezed_text(row, 'td[3]//text()')
    disapprove = squeezed_text(row, 'td[4]//text()')
    undecided = squeezed_text(row, 'td[5]//text()')
    n = XPath.match(row, 'td[7]//text()').join
    Poll.new(date, approve, disapprove, undecided, n)
  end

  # Joins the text nodes matched by +path+ under +node+, turns newlines into
  # spaces and collapses runs of spaces.
  # NOTE(review): Text#to_s yields the escaped form of the text; writing it
  # back through Element#text= may double-escape "&" -- confirm.
  def squeezed_text(node, path, separator = "")
    XPath.match(node, path).map { |text| text.to_s }.join(separator).tr("\n", " ").squeeze(" ")
  end

  # Serialises the entries as XML: one <poll> per entry, holding <source>,
  # <question> and one <instance> per poll row.
  def build_output(entries)
    $stderr.puts "Creating document"
    out = Document.new
    # A document built from an empty string has no root element, so one is
    # created explicitly. NOTE(review): the name "polls" is the reviewer's
    # choice -- confirm against whatever consumes this output.
    root = out.add_element("polls")

    entries.each do |entry|
      $stderr.print "\n*"
      $stderr.flush
      ent = root.add_element("poll")
      ent.add_element("source").text = entry.about
      ent.add_element("question").text = entry.question
      entry.entries.each do |poll|
        $stderr.print "."
        $stderr.flush
        inst = ent.add_element("instance")
        inst.add_element("date").text = poll.date
        inst.add_element("approve").text = poll.approve
        inst.add_element("disapprove").text = poll.disapprove
        inst.add_element("undecided").text = poll.undecided
        inst.add_element("N").text = poll.n
      end
    end

    $stderr.puts "\nWriting"
    buffer = "".dup # mutable target for Document#write
    out.write(buffer, 0)
    $stderr.puts "Written"
    buffer
  end
end

if $PROGRAM_NAME == __FILE__
  bs = BushSucker.new
  puts bs.get_data
end