summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorelijah <elijah@riseup.net>2016-05-25 15:15:07 -0700
committerelijah <elijah@riseup.net>2016-05-25 15:15:07 -0700
commitc8550e3712ffed2fb9c93f6ca423b0a49849ad36 (patch)
treeeb78997c1483abf5167efdec54bd99cc0cf6b8b0
initial commit
-rw-r--r--.gitignore4
-rw-r--r--Gemfile4
-rw-r--r--README.md60
-rw-r--r--Rakefile12
-rwxr-xr-xbin/parse-email-logs170
-rw-r--r--config/config.yml.example6
-rw-r--r--config/initializer.rb18
-rw-r--r--db/migrations/001_create_messages.rb32
8 files changed, 306 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..477d1e2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+Gemfile.lock
+config/config.yml
+db/email.sqlite3
+postfix.log.1 \ No newline at end of file
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..ff7c014
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+gem 'activerecord'
+gem 'sqlite3'
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..dd59a16
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+GETTING STARTED
+============================================
+
+Install necessary gems:
+
+ $ bundle
+
+Create a config file with the necessary secret:
+
+ $ sed -e s/CHANGEME/$(pwgen -s 30)/ config/config.yml.example > config/config.yml
+
+USAGE
+============================================
+
+ rake reset
+ cat postfix.log.1 | bin/parse-email-logs
+
+NOTES
+============================================
+
+encoded list sender
+--------------------------------------------
+
+The envelope "from" for mailing lists often encodes the recipient. For example:
+
+ bounce-debian-backports=micah=debian.org@lists.debian.org
+
+This is an entry for the mailing list debian-backports@lists.debian.org
+delivering mail to micah@debian.org.
+
+So, the data will appear to have many more unique envelope from addresses than
+there really are.
+
+quota
+--------------------------------------------
+
+The way we have postfix configured, we reject messages for users who are over
+quota very early on in the pipeline. By doing this, we radically reduce the
+overhead that the mail servers have for dealing with users who are over quota.
+
+One consequence of this is that incoming messages to users who are over quota
+never get a queue ID and will never show up in the dataset.
+
+
+TODO
+============================================
+
+handle over quota errors?
+
+ NOQUEUE: reject: RCPT from hotmail.com[0.0.0.0]: 450 4.7.1 <bob@riseup.net>: Recipient address rejected: Sorry, your message cannot be delivered to that person because their mailbox is full. If you can contact them another way, you may wish to tell them of this problem; from=<alice@hotmail.com> to=<bob@riseup.net> proto=ESMTP helo=<mx100.hotmail.com>
+
+what is "resent-message-id"?
+
+ May 20 22:16:47 mx1 postfix/smtpd[23894]: 106FC1A1FCB: client=bendel.debian.org[0.0.0.0]
+ May 20 22:16:47 mx1 postfix/cleanup[21313]: 106FC1A1FCB: message-id=<20160520221607.GA5201@riseup.net>
+ May 20 22:16:47 mx1 postfix/cleanup[21313]: 106FC1A1FCB: resent-message-id=<36m1DyUlwO.A.vVC.Nz4PXB@bendel>
+ May 20 22:16:47 mx1 postfix/qmgr[5938]: 106FC1A1FCB: from=<bounce-debian-glibc=xxxx=debian.org@lists.debian.org>, size=32505, nrcpt=1 (queue active)
+ May 20 22:16:47 mx1 postfix/smtp[21920]: 106FC1A1FCB: to=<xxxx@riseup.net>, relay=0.0.0.0[0.0.0.0]:25, delay=1.1, delays=1.1/0/0.04/0.02, dsn=2.0.0, status=sent (250 2.0.0 Ok: queued as 87B333F0)
+ May 20 22:16:47 mx1 postfix/qmgr[5938]: 106FC1A1FCB: removed
+
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..d57aeaa
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,12 @@
+require_relative 'config/initializer.rb'
+
+desc "Migrate the database through scripts in db/migrate"
+task :migrate do
+ ActiveRecord::Migrator.migrate("db/migrations/")
+end
+
+desc "Reset the db and start over"
+task :reset do
+ File.unlink(CONFIG['database']['database'])
+ ActiveRecord::Migrator.migrate("db/migrations/")
+end
diff --git a/bin/parse-email-logs b/bin/parse-email-logs
new file mode 100755
index 0000000..2a24261
--- /dev/null
+++ b/bin/parse-email-logs
@@ -0,0 +1,170 @@
+#!/usr/bin/ruby
+
+require_relative '../config/initializer'
+
+class Message < ActiveRecord::Base
+ self.inheritance_column = 'disabled'
+end
+
+$input = nil
+
+def parse_command_line
+ if ARGV.grep(/-h/).any?
+ usage
+ end
+ if ARGV[0] && File.exist?(ARGV[0])
+ $input = File.open(ARGV[0])
+ else
+ $input = ARGF
+ end
+end
+
+def usage
+ puts "USAGE: "
+ puts " option 1: parse-email-logs [LOGFILE]"
+ puts " option 2: cat log | parse-email-logs"
+ exit(0)
+end
+
+def parse_timestamp(str)
+ # e.g. May 20 20:17:14
+ DateTime.strptime(str, "%b %d %H:%M:%S")
+end
+
+def hash_addresses(str)
+ str.split(',').map {|address|
+ address = address.sub(/<.*>/, '')
+ address.split('@').map {|segment|
+ segment #Digest::HMAC.hexdigest(segment, CONFIG['secret'], Digest::MD5)
+ }.join('@')
+ }.join(',')
+end
+
+def get_message(queue_id, timestamp)
+ msg = Message.find_or_create_by(queue_id: queue_id)
+ if msg.first_seen_at.nil?
+ putc '.'; STDOUT.flush()
+ msg.first_seen_at = parse_timestamp(timestamp)
+ msg.save
+ end
+ return msg
+end
+
+#
+# if we see this, then it was incoming: "relay=0.0.0.0[0.0.0.0]:25"
+#
+def do_sent(m, ts, matches)
+ if m.recipient.nil?
+ if matches['relay'] == "relay=0.0.0.0[0.0.0.0]:25"
+ m.is_outgoing = false
+ else
+ m.is_outgoing = true
+ end
+ if m.is_outgoing?
+ m.sent_at = m.first_seen_at
+ m.received_at = parse_timestamp(ts)
+ else
+ # sent_at will be set by the 'Date' header
+ m.received_at = m.first_seen_at
+ end
+ m.recipient = hash_addresses(matches["to"])
+ m.orig_to = hash_addresses(matches["orig_to"]) if matches["orig_to"]
+ m.save
+ else
+
+ end
+end
+
+#
+# save the message size and the envelope sender
+#
+def do_queue(m, ts, matches)
+ return if m.size != nil
+ m.size = matches['size'].to_i
+ m.sender = hash_addresses(matches['from'])
+ m.save
+end
+
+#
+# save the message id
+#
+def do_message_id(m, ts, matches, line)
+ return if m.message_id
+ m_id = matches['message_id'].gsub(/[<>]/,'').strip
+ m.message_id = hash_addresses(m_id)
+ m.save
+end
+
+#
+# the message was rejected, likely because milter scan thinks it is a virus.
+# so we remove the record from the database
+#
+def do_purge_message(m, ts, matches)
+ m.destroy
+end
+
+def do_error(line)
+ puts "ERROR: unmatched line!"
+ puts " " + line
+end
+
+LINE_PARSE_MAP = {
+ 'postfix/smtp' => {
+ /to=<(?<to>.*?)>, (orig_to=<(?<orig_to>.*?)>, )?(?<relay>relay=.*?),.*status=sent/ => method(:do_sent),
+ /status=sent/ => :error
+ },
+ 'postfix/qmgr' => {
+ /from=<(?<from>.*)>, size=(?<size>\d+),.*\(queue active\)/ => method(:do_queue),
+ /\(queue active\)/ => :error
+ },
+ 'postfix/cleanup' => {
+ /message-id=(?<message_id>.*)$/ => method(:do_message_id),
+ /milter-reject/ => method(:do_purge_message),
+ // => :error
+ }
+}
+
+def process_line(line)
+ splits = line.split(' ')
+ timestamp = splits[0..2].join(' ')
+ daemon = splits[4].split('[').first
+ queue_id = splits[5].sub(':', '')
+ message = splits[6..-1].join(' ')
+ LINE_PARSE_MAP.fetch(daemon, {}).each do |re, method|
+ match = re.match(line)
+ next unless match
+ if method == :error
+ do_error(line)
+ elsif !method.nil?
+ msg = get_message(queue_id, timestamp)
+ if method.arity == 3
+ method.call(msg, timestamp, match)
+ else
+ method.call(msg, timestamp, match, line)
+ end
+ end
+ break
+ end
+end
+
+def main
+ parse_command_line
+ start_time = Time.now
+ start_msg = Message.count
+ line_count = 0
+ Message.transaction do
+ $input.each_line do |line|
+ process_line(line)
+ line_count += 1
+ end
+ end
+ end_time = Time.now
+ end_msg = Message.count
+ puts
+ puts "FINISHED"
+ puts " Time: %s minutes" % ((end_time - start_time).to_i / 60)
+ puts "Records: %s" % (end_msg - start_msg)
+ puts " Lines: %s" % line_count
+end
+
+main() \ No newline at end of file
diff --git a/config/config.yml.example b/config/config.yml.example
new file mode 100644
index 0000000..e769005
--- /dev/null
+++ b/config/config.yml.example
@@ -0,0 +1,6 @@
+---
+database:
+ adapter: sqlite3
+ database: db/email.sqlite3
+secret: CHANGEME
+
diff --git a/config/initializer.rb b/config/initializer.rb
new file mode 100644
index 0000000..c8e5cca
--- /dev/null
+++ b/config/initializer.rb
@@ -0,0 +1,18 @@
+ROOT = File.dirname(File.dirname(__FILE__))
+$: << ROOT
+
+require 'yaml'
+require 'date'
+require 'digest/hmac'
+require 'active_record'
+#require 'active_support'
+
+if ARGV.grep(/--debug/)
+ require 'byebug'
+end
+
+CONFIG = YAML.load(
+ File.read(File.join(ROOT, 'config','config.yml'))
+)
+
+ActiveRecord::Base.establish_connection(CONFIG['database'])
diff --git a/db/migrations/001_create_messages.rb b/db/migrations/001_create_messages.rb
new file mode 100644
index 0000000..9ff04e8
--- /dev/null
+++ b/db/migrations/001_create_messages.rb
@@ -0,0 +1,32 @@
+
+class CreateMessages < ActiveRecord::Migration
+ def up
+ create_table :messages do |t|
+ t.string :queue_id
+ t.string :message_id
+ t.datetime :first_seen_at
+ t.datetime :sent_at
+ t.datetime :received_at
+ t.datetime :date
+ t.string :sender
+ t.string :recipient
+ t.string :orig_to
+ t.string :from
+ t.text :to
+ t.text :cc
+ t.text :bcc
+ t.integer :size
+ t.integer :spam_score
+ t.integer :subject_length
+ t.boolean :is_list, :default => false
+ t.boolean :is_outgoing, :default => nil
+ t.string :re_message_id, :default => nil
+ end
+ add_index :messages, :message_id
+ end
+
+ def down
+ drop_table :messages
+ end
+end
+