diff options
author | elijah <elijah@riseup.net> | 2016-05-25 15:15:07 -0700 |
---|---|---|
committer | elijah <elijah@riseup.net> | 2016-05-25 15:15:07 -0700 |
commit | c8550e3712ffed2fb9c93f6ca423b0a49849ad36 (patch) | |
tree | eb78997c1483abf5167efdec54bd99cc0cf6b8b0 |
initial commit
-rw-r--r-- | .gitignore | 4 | ||||
-rw-r--r-- | Gemfile | 4 | ||||
-rw-r--r-- | README.md | 60 | ||||
-rw-r--r-- | Rakefile | 12 | ||||
-rwxr-xr-x | bin/parse-email-logs | 170 | ||||
-rw-r--r-- | config/config.yml.example | 6 | ||||
-rw-r--r-- | config/initializer.rb | 18 | ||||
-rw-r--r-- | db/migrations/001_create_messages.rb | 32 |
8 files changed, 306 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..477d1e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +Gemfile.lock +config/config.yml +db/email.sqlite3 +postfix.log.1
\ No newline at end of file @@ -0,0 +1,4 @@ +source 'https://rubygems.org' + +gem 'activerecord' +gem 'sqlite3' diff --git a/README.md b/README.md new file mode 100644 index 0000000..dd59a16 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +GETTING STARTED +============================================ + +Install necessary gems: + + $ bundle + +Create a config file with the necessary secret: + + $ sed -e s/CHANGEME/$(pwgen -s 30)/ config/config.yml.example > config/config.yml + +USAGE +============================================ + + rake reset + cat postfix.log.1 | bin/parse-email-logs + +NOTES +============================================ + +encoded list sender +-------------------------------------------- + +The envelope "from" for mailing lists often encodes the recipient. For example: + + bounce-debian-backports=micah=debian.org@lists.debian.org + +This is an entry for the mailing list debian-backports@lists.debian.org +delivering mail to micah@debian.org. + +So, the data will appear to have many more unique envelope from addresses than +there really are. + +quota +-------------------------------------------- + +The way we have postfix configured, we reject messages for users who are over +quota very early on in the pipeline. By doing this, we radically reduce the +overhead that the mail servers have for dealing with users who are over quota. + +One consequence of this is that incoming messages to users who are over quota +never get a queue ID and will never show up in the dataset. + + +TODO +============================================ + +handle over quota errors? + + NOQUEUE: reject: RCPT from hotmail.com[0.0.0.0]: 450 4.7.1 <bob@riseup.net>: Recipient address rejected: Sorry, your message cannot be delivered to that person because their mailbox is full. If you can contact them another way, you may wish to tell them of this problem; from=<alice@hotmail.com> to=<bob@riseup.net> proto=ESMTP helo=<mx100.hotmail.com> + +what is "resent-message-id"? + + May 20 22:16:47 mx1 postfix/smtpd[23894]: 106FC1A1FCB: client=bendel.debian.org[0.0.0.0] + May 20 22:16:47 mx1 postfix/cleanup[21313]: 106FC1A1FCB: message-id=<20160520221607.GA5201@riseup.net> + May 20 22:16:47 mx1 postfix/cleanup[21313]: 106FC1A1FCB: resent-message-id=<36m1DyUlwO.A.vVC.Nz4PXB@bendel> + May 20 22:16:47 mx1 postfix/qmgr[5938]: 106FC1A1FCB: from=<bounce-debian-glibc=xxxx=debian.org@lists.debian.org>, size=32505, nrcpt=1 (queue active) + May 20 22:16:47 mx1 postfix/smtp[21920]: 106FC1A1FCB: to=<xxxx@riseup.net>, relay=0.0.0.0[0.0.0.0]:25, delay=1.1, delays=1.1/0/0.04/0.02, dsn=2.0.0, status=sent (250 2.0.0 Ok: queued as 87B333F0) + May 20 22:16:47 mx1 postfix/qmgr[5938]: 106FC1A1FCB: removed + diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..d57aeaa --- /dev/null +++ b/Rakefile @@ -0,0 +1,12 @@ +require_relative 'config/initializer.rb' + +desc "Migrate the database through scripts in db/migrate" +task :migrate do + ActiveRecord::Migrator.migrate("db/migrations/") +end + +desc "Reset the db and start over" +task :reset do + File.unlink(CONFIG['database']['database']) + ActiveRecord::Migrator.migrate("db/migrations/") +end diff --git a/bin/parse-email-logs b/bin/parse-email-logs new file mode 100755 index 0000000..2a24261 --- /dev/null +++ b/bin/parse-email-logs @@ -0,0 +1,170 @@ +#!/usr/bin/ruby + +require_relative '../config/initializer' + +class Message < ActiveRecord::Base + self.inheritance_column = 'disabled' +end + +$input = nil + +def parse_command_line + if ARGV.grep(/-h/).any? + usage + end + if ARGV[0] && File.exist?(ARGV[0]) + $input = File.open(ARGV[0]) + else + $input = ARGF + end +end + +def usage + puts "USAGE: " + puts " option 1: parse-email-logs [LOGFILE]" + puts " option 2: cat log | parse-email-logs" + exit(0) +end + +def parse_timestamp(str) + # e.g. May 20 20:17:14 + DateTime.strptime(str, "%b %d %H:%M:%S") +end + +def hash_addresses(str) + str.split(',').map {|address| + address = address.sub(/<.*>/, '') + address.split('@').map {|segment| + segment #Digest::HMAC.hexdigest(segment, CONFIG['secret'], Digest::MD5) + }.join('@') + }.join(',') +end + +def get_message(queue_id, timestamp) + msg = Message.find_or_create_by(queue_id: queue_id) + if msg.first_seen_at.nil? + putc '.'; STDOUT.flush() + msg.first_seen_at = parse_timestamp(timestamp) + msg.save + end + return msg +end + +# +# if we see this, then it was incoming: "relay=0.0.0.0[0.0.0.0]:25" +# +def do_sent(m, ts, matches) + if m.recipient.nil? + if matches['relay'] == "relay=0.0.0.0[0.0.0.0]:25" + m.is_outgoing = false + else + m.is_outgoing = true + end + if m.is_outgoing? + m.sent_at = m.first_seen_at + m.received_at = parse_timestamp(ts) + else + # sent_at will be set by the 'Date' header + m.received_at = m.first_seen_at + end + m.recipient = hash_addresses(matches["to"]) + m.orig_to = hash_addresses(matches["orig_to"]) if matches["orig_to"] + m.save + else + + end +end + +# +# save the message size and the envelope sender +# +def do_queue(m, ts, matches) + return if m.size != nil + m.size = matches['size'].to_i + m.sender = hash_addresses(matches['from']) + m.save +end + +# +# save the message id +# +def do_message_id(m, ts, matches, line) + return if m.message_id + m_id = matches['message_id'].gsub(/[<>]/,'').strip + m.message_id = hash_addresses(m_id) + m.save +end + +# +# the message was rejected, likely because milter scan thinks it is a virus. +# so we remove the record from the database +# +def do_purge_message(m, ts, matches) + m.destroy +end + +def do_error(line) + puts "ERROR: unmatched line!" + puts " " + line +end + +LINE_PARSE_MAP = { + 'postfix/smtp' => { + /to=<(?<to>.*?)>, (orig_to=<(?<orig_to>.*?)>, )?(?<relay>relay=.*?),.*status=sent/ => method(:do_sent), + /status=sent/ => :error + }, + 'postfix/qmgr' => { + /from=<(?<from>.*)>, size=(?<size>\d+),.*\(queue active\)/ => method(:do_queue), + /\(queue active\)/ => :error + }, + 'postfix/cleanup' => { + /message-id=(?<message_id>.*)$/ => method(:do_message_id), + /milter-reject/ => method(:do_purge_message), + // => :error + } +} + +def process_line(line) + splits = line.split(' ') + timestamp = splits[0..2].join(' ') + daemon = splits[4].split('[').first + queue_id = splits[5].sub(':', '') + message = splits[6..-1].join(' ') + LINE_PARSE_MAP.fetch(daemon, {}).each do |re, method| + match = re.match(line) + next unless match + if method == :error + do_error(line) + elsif !method.nil? + msg = get_message(queue_id, timestamp) + if method.arity == 3 + method.call(msg, timestamp, match) + else + method.call(msg, timestamp, match, line) + end + end + break + end +end + +def main + parse_command_line + start_time = Time.now + start_msg = Message.count + line_count = 0 + Message.transaction do + $input.each_line do |line| + process_line(line) + line_count += 1 + end + end + end_time = Time.now + end_msg = Message.count + puts + puts "FINISHED" + puts " Time: %s minutes" % ((end_time - start_time).to_i / 60) + puts "Records: %s" % (end_msg - start_msg) + puts " Lines: %s" % line_count +end + +main()
\ No newline at end of file diff --git a/config/config.yml.example b/config/config.yml.example new file mode 100644 index 0000000..e769005 --- /dev/null +++ b/config/config.yml.example @@ -0,0 +1,6 @@ +--- +database: + adapter: sqlite3 + database: db/email.sqlite3 +secret: CHANGEME + diff --git a/config/initializer.rb b/config/initializer.rb new file mode 100644 index 0000000..c8e5cca --- /dev/null +++ b/config/initializer.rb @@ -0,0 +1,18 @@ +ROOT = File.dirname(File.dirname(__FILE__)) +$: << ROOT + +require 'yaml' +require 'date' +require 'digest/hmac' +require 'active_record' +#require 'active_support' + +if ARGV.grep(/--debug/) + require 'byebug' +end + +CONFIG = YAML.load( + File.read(File.join(ROOT, 'config','config.yml')) +) + +ActiveRecord::Base.establish_connection(CONFIG['database']) diff --git a/db/migrations/001_create_messages.rb b/db/migrations/001_create_messages.rb new file mode 100644 index 0000000..9ff04e8 --- /dev/null +++ b/db/migrations/001_create_messages.rb @@ -0,0 +1,32 @@ + +class CreateMessages < ActiveRecord::Migration + def up + create_table :messages do |t| + t.string :queue_id + t.string :message_id + t.datetime :first_seen_at + t.datetime :sent_at + t.datetime :received_at + t.datetime :date + t.string :sender + t.string :recipient + t.string :orig_to + t.string :from + t.text :to + t.text :cc + t.text :bcc + t.integer :size + t.integer :spam_score + t.integer :subject_length + t.boolean :is_list, :default => false + t.boolean :is_outgoing, :default => nil + t.string :re_message_id, :default => nil + end + add_index :messages, :message_id + end + + def down + drop_table :messages + end +end + |