rake tasks for importing geoip data

This commit is contained in:
Scott Comer 2014-03-09 17:35:12 -05:00
parent c1bea4ae03
commit af748cba75
5 changed files with 352 additions and 13 deletions

View File

@ -16,6 +16,7 @@ CREATE INDEX geoiplocations_geog_gix ON geoiplocations USING GIST (geog);
-- Rebuild the geom column on geoipblocks from scratch.
ALTER TABLE geoipblocks DROP COLUMN geom;
ALTER TABLE geoipblocks ADD COLUMN geom geometry(polygon);
-- Presumably left commented out because dropping the column above already
-- removes an index built on it -- confirm before re-enabling.
-- DROP INDEX geoipblocks_geom_gix;
-- Each block becomes a rectangle spanning beginip..endip on x and -1..1 on y.
UPDATE geoipblocks SET geom = ST_MakeEnvelope(beginip, -1, endip, 1);
-- GIST index over those rectangles.
CREATE INDEX geoipblocks_geom_gix ON geoipblocks USING GIST (geom);
@ -24,13 +25,12 @@ ALTER TABLE jamisp ADD COLUMN geom geometry(polygon);
UPDATE jamisp SET geom = ST_MakeEnvelope(beginip, -1, endip, 1);
CREATE INDEX jamisp_geom_gix ON jamisp USING GIST (geom);
delete from cities;
insert into cities (city, region, countrycode) select distinct city, region, countrycode from geoiplocations where length(city) > 0 and length(countrycode) > 0;
DELETE FROM cities;
INSERT INTO cities (city, region, countrycode) SELECT DISTINCT city, region, countrycode FROM geoiplocations WHERE length(city) > 0 AND length(countrycode) > 0;
DELETE FROM regions;
INSERT INTO regions (region, countrycode) SELECT DISTINCT region, countrycode FROM cities;
DELETE FROM countries;
INSERT INTO countries (countrycode) SELECT DISTINCT countrycode FROM regions;
delete from regions;
insert into regions (region, countrycode) select distinct region, countrycode from cities;
delete from countries;
insert into countries (countrycode) select distinct countrycode from regions;
VACUUM ANALYSE;

View File

@ -15,7 +15,97 @@ module JamRuby
end
# Replaces the contents of the GeoIpBlocks table with the rows of a MaxMind
# "blocks" CSV file, then rebuilds the geom column and its GIST index.
#
# File Geo-134
# Format:
# startIpNum,endIpNum,locId
#
# Everything runs inside a single transaction, so a failure at any step leaves
# the previous table contents in place. Rows are inserted with multi-row INSERT
# statements of up to 20 rows. While inserting, a DEBUG-level logger is raised
# to INFO so the log is not flooded with SQL; a progress line is written
# roughly every 10000 rows and the original level is always restored.
#
# NOTE(review): locId is interpolated into the SQL unquoted, exactly as read
# from the file; the file is assumed to be trusted MaxMind data.
#
# @param file [String] path of the CSV file (read as ISO-8859-1)
# @raise [RuntimeError] if line 1 is not the expected copyright notice, line 2
#   is not the expected header, or a data row does not have exactly 3 columns
def self.import_from_max_mind(file)
  copyright = 'Copyright (c) 2011 MaxMind Inc. All Rights Reserved.'
  header = 'startIpNum,endIpNum,locId'
  batch_size = 20

  logger = ActiveRecord::Base.logger
  saved_level = logger ? logger.level : 0
  # Level used while bulk inserting: INFO (1) hides the SQL that is logged at
  # DEBUG (0) without ever lowering a stricter configured level.
  quiet_level = [saved_level, 1].max

  GeoIpBlocks.transaction do
    GeoIpBlocks.delete_all
    File.open(file, 'r:ISO-8859-1') do |io|
      # to_s guards against an empty file, where gets returns nil.
      s = io.gets.to_s.strip # eat the copyright line. gah, why do they have that in their file??
      unless s.eql? copyright
        puts s
        puts copyright
        raise "file does not start with expected copyright (line 1): #{copyright}"
      end

      s = io.gets.to_s.strip # eat the headers line
      unless s.eql? header
        puts s
        puts header
        raise "file does not start with expected header (line 2): #{header}"
      end

      conn = GeoIpBlocks.connection
      table = GeoIpBlocks.table_name
      stmt = "insert into #{table} (beginip, endip, locid) values"
      count = 0
      batch = []

      # Sends the pending rows as one multi-row INSERT and empties the batch.
      flush = lambda do
        conn.execute stmt + batch.join(',')
        count += batch.size
        batch.clear
      end

      csv = ::CSV.new(io, encoding: 'ISO-8859-1', headers: false)
      csv.each do |row|
        raise "file does not have expected number of columns (3): #{row.length}" unless row.length == 3

        beginip = MaxMindIsp.ip_address_to_int(MaxMindIsp.strip_quotes(row[0]))
        endip = MaxMindIsp.ip_address_to_int(MaxMindIsp.strip_quotes(row[1]))
        locid = row[2]
        batch << "(#{beginip}, #{endip}, #{locid})"

        # The very first row is sent on its own (presumably so one sample
        # INSERT is logged before logging is suspended); after that rows go
        # out in groups of batch_size.
        next unless count == 0 || batch.size >= batch_size

        flush.call
        next unless logger

        if logger.level < 1
          logger.debug "... logging inserts into #{table} suspended ..."
          logger.level = 1
        end
        if count % 10000 < batch_size
          logger.level = saved_level
          logger.debug "... inserted #{count} into #{table} ..."
          logger.level = quiet_level
        end
      end
      flush.call unless batch.empty?

      if logger
        logger.level = saved_level
        logger.debug "loaded #{count} records into #{table}"
      end

      # Executes one statement, logs the named result field and optionally
      # runs the result's own status check.
      run = lambda do |sql, label, field, checked|
        sts = conn.execute sql
        logger.debug "#{label} returned sts #{sts.public_send(field)}" if logger
        sts.check if checked
      end

      # IF EXISTS: a missing column must not abort the surrounding transaction.
      run.call 'ALTER TABLE geoipblocks DROP COLUMN IF EXISTS geom;', 'DROP COLUMN geom', :cmd_status, false
      run.call 'ALTER TABLE geoipblocks ADD COLUMN geom geometry(polygon);', 'ADD COLUMN geom', :cmd_status, true
      run.call 'UPDATE geoipblocks SET geom = ST_MakeEnvelope(beginip, -1, endip, 1);', 'SET geom', :cmd_tuples, true
      run.call 'CREATE INDEX geoipblocks_geom_gix ON geoipblocks USING GIST (geom);', 'CREATE INDEX geoipblocks_geom_gix', :cmd_status, true
    end
  end
ensure
  # Never leave the shared logger at the quiet level, even when the import fails.
  logger.level = saved_level if logger
end
end
end

View File

@ -2,21 +2,149 @@ module JamRuby
class GeoIpLocations < ActiveRecord::Base
self.table_name = 'geoiplocations'
CITIES_TABLE = 'cities'
REGIONS_TABLE = 'regions'
COUNTRIES_TABLE = 'countries'
# Finds the location row for a MaxMind location id.
#
# @param locid the location id to match against the locid column
# @return the first matching record, or nil when there is none
def self.lookup(locid)
  where(locid: locid)
    .limit(1)
    .first
end
# Inserts one location row through the raw connection, using bind parameters,
# and derives the geog column from longitude ($7) and latitude ($6).
#
# @return whatever the raw connection's exec_params returns
def self.createx(locid, countrycode, region, city, postalcode, latitude, longitude, metrocode, areacode)
  sql = "insert into #{self.table_name} (locid, countrycode, region, city, postalcode, latitude, longitude, metrocode, areacode, geog) values($1, $2, $3, $4, $5, $6, $7, $8, $9, ST_SetSRID(ST_MakePoint($7, $6), 4326)::geography)"
  params = [locid, countrycode, region, city, postalcode, latitude, longitude, metrocode, areacode]
  connection.raw_connection.exec_params(sql, params)
end
# Renders a CSV field as an integer SQL literal.
#
# @param s the raw field value; may be nil or blank
# @return the string 'NULL' for a nil or blank value, otherwise s.to_i
def self.i(s)
  if s.nil? || s.blank?
    'NULL'
  else
    s.to_i
  end
end
# Replaces the contents of this model's table with the rows of a MaxMind
# "location" CSV file, rebuilds the geog column and its GIST index, and then
# regenerates the cities, regions and countries tables from the imported rows.
#
# File Geo-134
# Format:
# locId,country,region,city,postalCode,latitude,longitude,metroCode,areaCode
#
# Everything runs inside a single transaction. Rows are inserted with
# multi-row INSERT statements of up to 20 rows. While inserting, a DEBUG-level
# logger is raised to INFO so the log is not flooded with SQL; a progress line
# is written roughly every 10000 rows and the original level is always
# restored.
#
# NOTE(review): country, region, postal code and area code are wrapped in
# single quotes without escaping, and locId/latitude/longitude are interpolated
# as-is; only city goes through MaxMindIsp.quote_value. This relies on the
# file being trusted MaxMind data -- consider quoting every text column.
#
# @param file [String] path of the CSV file (read as ISO-8859-1)
# @raise [RuntimeError] if line 1 is not the expected copyright notice, line 2
#   is not the expected header, or a data row does not have exactly 9 columns
def self.import_from_max_mind(file)
  copyright = 'Copyright (c) 2012 MaxMind LLC. All Rights Reserved.'
  header = 'locId,country,region,city,postalCode,latitude,longitude,metroCode,areaCode'
  batch_size = 20

  logger = ActiveRecord::Base.logger
  saved_level = logger ? logger.level : 0
  # Level used while bulk inserting: INFO (1) hides the SQL that is logged at
  # DEBUG (0) without ever lowering a stricter configured level.
  quiet_level = [saved_level, 1].max

  self.transaction do
    self.delete_all
    File.open(file, 'r:ISO-8859-1') do |io|
      # to_s guards against an empty file, where gets returns nil.
      s = io.gets.to_s.strip # eat the copyright line. gah, why do they have that in their file??
      unless s.eql? copyright
        puts s
        puts copyright
        raise "file does not start with expected copyright (line 1): #{copyright}"
      end

      s = io.gets.to_s.strip # eat the headers line
      unless s.eql? header
        puts s
        puts header
        raise "file does not start with expected header (line 2): #{header}"
      end

      conn = self.connection
      table = self.table_name
      stmt = "INSERT INTO #{table} (locid, countrycode, region, city, postalcode, latitude, longitude, metrocode, areacode) VALUES"
      count = 0
      batch = []

      # Sends the pending rows as one multi-row INSERT and empties the batch.
      flush = lambda do
        conn.execute stmt + batch.join(',')
        count += batch.size
        batch.clear
      end

      csv = ::CSV.new(io, encoding: 'ISO-8859-1', headers: false)
      csv.each do |row|
        raise "file does not have expected number of columns (9): #{row.length}" unless row.length == 9

        locid, countrycode, region, city, postalcode, latitude, longitude, metrocode, areacode = row
        batch << "(#{locid}, '#{countrycode}', '#{region}', #{MaxMindIsp.quote_value(city)}, '#{postalcode}', #{latitude}, #{longitude}, #{i(metrocode)}, '#{areacode}')"

        # The very first row is sent on its own (presumably so one sample
        # INSERT is logged before logging is suspended); after that rows go
        # out in groups of batch_size.
        next unless count == 0 || batch.size >= batch_size

        flush.call
        next unless logger

        if logger.level < 1
          logger.debug "... logging inserts into #{table} suspended ..."
          logger.level = 1
        end
        if count % 10000 < batch_size
          logger.level = saved_level
          logger.debug "... inserted #{count} into #{table} ..."
          logger.level = quiet_level
        end
      end
      flush.call unless batch.empty?

      if logger
        logger.level = saved_level
        logger.debug "loaded #{count} records into #{table}"
      end

      # Executes one statement, logs the named result field and optionally
      # runs the result's own status check.
      run = lambda do |sql, label, field, checked|
        sts = conn.execute sql
        logger.debug "#{label} returned sts #{sts.public_send(field)}" if logger
        sts.check if checked
      end

      # Rebuild the geography column and its index from longitude/latitude.
      # IF EXISTS: a missing column must not abort the surrounding transaction.
      run.call "ALTER TABLE #{table} DROP COLUMN IF EXISTS geog;", 'DROP COLUMN geog', :cmd_status, false
      run.call "ALTER TABLE #{table} ADD COLUMN geog geography(point, 4326);", 'ADD COLUMN geog', :cmd_status, true
      run.call "UPDATE #{table} SET geog = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography;", 'SET geog', :cmd_tuples, true
      run.call "CREATE INDEX #{table}_geog_gix ON #{table} USING GIST (geog);", "CREATE INDEX #{table}_geog_gix", :cmd_status, true

      # Regenerate the lookup tables: cities from locations, regions from
      # cities, countries from regions.
      run.call "DELETE FROM #{CITIES_TABLE};", "DELETE FROM #{CITIES_TABLE}", :cmd_status, true
      run.call "INSERT INTO #{CITIES_TABLE} (city, region, countrycode) SELECT DISTINCT city, region, countrycode FROM #{table} WHERE length(city) > 0 AND length(countrycode) > 0;", "INSERT INTO #{CITIES_TABLE}", :cmd_status, true
      run.call "DELETE FROM #{REGIONS_TABLE};", "DELETE FROM #{REGIONS_TABLE}", :cmd_status, true
      run.call "INSERT INTO #{REGIONS_TABLE} (region, countrycode) SELECT DISTINCT region, countrycode FROM #{CITIES_TABLE};", "INSERT INTO #{REGIONS_TABLE}", :cmd_status, true
      run.call "DELETE FROM #{COUNTRIES_TABLE};", "DELETE FROM #{COUNTRIES_TABLE}", :cmd_status, true
      run.call "INSERT INTO #{COUNTRIES_TABLE} (countrycode) SELECT DISTINCT countrycode FROM #{REGIONS_TABLE};", "INSERT INTO #{COUNTRIES_TABLE}", :cmd_status, true
    end
  end
ensure
  # Never leave the shared logger at the quiet level, even when the import fails.
  logger.level = saved_level if logger
end
end
end

View File

@ -4,6 +4,8 @@ module JamRuby
class JamIsp < ActiveRecord::Base
self.table_name = 'jamisp'
COMPANY_TABLE = 'jamcompany'
GEOIPISP_TABLE = 'geoipisp'
def self.ip_to_num(ip_addr)
begin
@ -24,7 +26,7 @@ module JamRuby
# Inserts one ISP block through the raw connection, using bind parameters, and
# derives geom as the rectangle spanning beginip..endip on x and -1..1 on y.
#
# @return whatever the raw connection's exec_params returns
def self.createx(beginip, endip, coid)
  sql = "insert into #{self.table_name} (beginip, endip, coid, geom) values($1::bigint, $2::bigint, $3, ST_MakeEnvelope($1::bigint, -1, $2::bigint, 1))"
  connection.raw_connection.exec_params(sql, [beginip, endip, coid])
end
@ -37,7 +39,117 @@ module JamRuby
end
# Loads a MaxMind ISP CSV file into the geoipisp staging table, then rebuilds
# the company table and this model's table (including its geom column and GIST
# index) from the staged rows.
#
# File Geo-124
# Format:
# startIpNum,endIpNum,isp
#
# Everything runs inside a single transaction. The staging table is emptied
# first so that re-running the import does not accumulate duplicate rows.
# Rows are inserted with multi-row INSERT statements of up to 20 rows. While
# inserting, a DEBUG-level logger is raised to INFO so the log is not flooded
# with SQL; a progress line is written roughly every 10000 rows and the
# original level is always restored.
#
# NOTE(review): unlike the other MaxMind imports, no copyright or header line
# is validated or skipped here (that check was commented out in the previous
# version) -- presumably the ISP file has neither; confirm against a real file.
#
# @param file [String] path of the CSV file (read as ISO-8859-1)
# @raise [RuntimeError] if a data row does not have exactly 3 columns
def self.import_from_max_mind(file)
  batch_size = 20

  logger = ActiveRecord::Base.logger
  saved_level = logger ? logger.level : 0
  # Level used while bulk inserting: INFO (1) hides the SQL that is logged at
  # DEBUG (0) without ever lowering a stricter configured level.
  quiet_level = [saved_level, 1].max

  self.transaction do
    conn = self.connection
    table = self.table_name

    # Clear the staging table this import writes to. The previous version
    # called GeoIpLocations.delete_all here, which wiped the unrelated
    # locations table and left old staging rows in place.
    conn.execute "DELETE FROM #{GEOIPISP_TABLE};"

    File.open(file, 'r:ISO-8859-1') do |io|
      stmt = "insert into #{GEOIPISP_TABLE} (beginip, endip, company) values"
      count = 0
      batch = []

      # Sends the pending rows as one multi-row INSERT and empties the batch.
      flush = lambda do
        conn.execute stmt + batch.join(',')
        count += batch.size
        batch.clear
      end

      csv = ::CSV.new(io, encoding: 'ISO-8859-1', headers: false)
      csv.each do |row|
        raise "file does not have expected number of columns (3): #{row.length}" unless row.length == 3

        beginip = MaxMindIsp.ip_address_to_int(MaxMindIsp.strip_quotes(row[0]))
        endip = MaxMindIsp.ip_address_to_int(MaxMindIsp.strip_quotes(row[1]))
        company = row[2]
        batch << "(#{beginip}, #{endip}, #{MaxMindIsp.quote_value(company)})"

        # The very first row is sent on its own (presumably so one sample
        # INSERT is logged before logging is suspended); after that rows go
        # out in groups of batch_size.
        next unless count == 0 || batch.size >= batch_size

        flush.call
        next unless logger

        if logger.level < 1
          logger.debug "... logging inserts into #{GEOIPISP_TABLE} suspended ..."
          logger.level = 1
        end
        if count % 10000 < batch_size
          logger.level = saved_level
          logger.debug "... inserted #{count} into #{GEOIPISP_TABLE} ..."
          logger.level = quiet_level
        end
      end
      flush.call unless batch.empty?

      if logger
        logger.level = saved_level
        logger.debug "loaded #{count} records into #{GEOIPISP_TABLE}"
      end

      # Executes one statement, logs the named result field and optionally
      # runs the result's own status check.
      run = lambda do |sql, label, field, checked|
        sts = conn.execute sql
        logger.debug "#{label} returned sts #{sts.public_send(field)}" if logger
        sts.check if checked
      end

      # Rebuild the company table with ids restarting at 1.
      run.call "DELETE FROM #{COMPANY_TABLE};", "DELETE FROM #{COMPANY_TABLE}", :cmd_status, true
      run.call "ALTER SEQUENCE #{COMPANY_TABLE}_coid_seq RESTART WITH 1;", "ALTER SEQUENCE #{COMPANY_TABLE}_coid_seq", :cmd_status, true
      run.call "INSERT INTO #{COMPANY_TABLE} (company) SELECT DISTINCT company FROM #{GEOIPISP_TABLE} ORDER BY company;", "INSERT INTO #{COMPANY_TABLE}", :cmd_status, true

      # Rebuild this table by joining the staged blocks to their company ids.
      run.call "DELETE FROM #{table};", "DELETE FROM #{table}", :cmd_status, true
      run.call "INSERT INTO #{table} (beginip, endip, coid) SELECT x.beginip, x.endip, y.coid FROM #{GEOIPISP_TABLE} x, #{COMPANY_TABLE} y WHERE x.company = y.company;", "INSERT INTO #{table}", :cmd_status, true

      # Rebuild the geometry column and its index.
      # IF EXISTS: a missing column must not abort the surrounding transaction.
      run.call "ALTER TABLE #{table} DROP COLUMN IF EXISTS geom;", 'DROP COLUMN geom', :cmd_status, false
      run.call "ALTER TABLE #{table} ADD COLUMN geom geometry(polygon);", 'ADD COLUMN geom', :cmd_status, true
      run.call "UPDATE #{table} SET geom = ST_MakeEnvelope(beginip, -1, endip, 1);", 'SET geom', :cmd_tuples, true
      run.call "CREATE INDEX #{table}_geom_gix ON #{table} USING GIST (geom);", "CREATE INDEX #{table}_geom_gix", :cmd_status, true
    end
  end
ensure
  # Never leave the shared logger at the quiet level, even when the import fails.
  logger.level = saved_level if logger
end
end
end

View File

@ -24,6 +24,15 @@ namespace :db do
JamIsp.import_from_max_mind ENV['file']
end
desc "Help"
task help: :environment do
  # One usage line per import task, printed in this order.
  [
    "bundle exec rake db:import_maxmind_isp file=/path/to/GeoIPISP-142.csv",
    "bundle exec rake db:import_maxmind_geo file=/path/to/GeoIPCity.csv",
    "bundle exec rake db:import_geoip_blocks file=/path/to/GeoIPCity-134-Blocks.csv",
    "bundle exec rake db:import_geoip_locations file=/path/to/GeoIPCity-134-Location.csv",
    "bundle exec rake db:import_jam_isp file=/path/to/GeoIPISP.csv"
  ].each { |usage| puts usage }
end
desc "Create a fake set of maxmind data"
task phony_maxmind: :environment do
MaxMindManager.active_record_transaction do |manager|