Skip to content

Commit

Permalink
chore: Improve email parsing using email trimmer gem (#3611)
Browse files Browse the repository at this point in the history
Email parsing using the email_reply_trimmer gem

Fixes: #3539, #2954, #3572
  • Loading branch information
tejaswinichile committed Dec 22, 2021
1 parent 009abc1 commit 44486fc
Show file tree
Hide file tree
Showing 9 changed files with 1,232 additions and 32 deletions.
4 changes: 4 additions & 0 deletions Gemfile
Expand Up @@ -121,6 +121,10 @@ gem 'hairtrigger'

gem 'procore-sift'

# parse email
gem 'email_reply_trimmer'
gem 'html2text'

group :production, :staging do
# we don't want requests timing out in development while using byebug
gem 'rack-timeout'
Expand Down
5 changes: 5 additions & 0 deletions Gemfile.lock
Expand Up @@ -179,6 +179,7 @@ GEM
addressable (~> 2.8)
ecma-re-validator (0.3.0)
regexp_parser (~> 2.0)
email_reply_trimmer (0.1.13)
erubi (1.10.0)
erubis (2.7.0)
et-orbi (1.2.5)
Expand Down Expand Up @@ -290,6 +291,8 @@ GEM
hashdiff (1.0.1)
hashie (4.1.0)
hkdf (0.3.0)
html2text (0.2.1)
nokogiri (~> 1.6)
http-accept (1.7.0)
http-cookie (1.0.4)
domain_name (~> 0.5)
Expand Down Expand Up @@ -668,6 +671,7 @@ DEPENDENCIES
devise_token_auth
dotenv-rails
down (~> 5.0)
email_reply_trimmer
facebook-messenger
factory_bot_rails
faker
Expand All @@ -682,6 +686,7 @@ DEPENDENCIES
haikunator
hairtrigger
hashie
html2text
image_processing
jbuilder
json_refs
Expand Down
31 changes: 31 additions & 0 deletions app/presenters/html_parser.rb
@@ -0,0 +1,31 @@
# Turns the HTML body of an email into text after stripping quoted material.
#
# The raw HTML is parsed with Nokogiri, every <blockquote> node is replaced by
# the literal text '> ', every <table> node is removed, and the remaining
# markup is handed to Html2Text.convert.
class HtmlParser
  class << self
    # Shortcut for building a parser and returning its filtered text.
    #
    # raw_body - the HTML source to process.
    #
    # Returns whatever #filtered_text returns for that body.
    def parse_reply(raw_body)
      parser = new(raw_body)
      parser.filtered_text
    end
  end

  # The HTML source exactly as it was passed to the constructor.
  attr_reader :raw_body

  # raw_body - the HTML source to process.
  def initialize(raw_body)
    @raw_body = raw_body
  end

  # The parsed Nokogiri document for raw_body, built on first use and reused.
  def document
    return @document if @document

    @document = Nokogiri::HTML(raw_body)
  end

  # Mutates the parsed document in place: blockquotes become the text '> '
  # and tables are dropped. Returns the result of iterating the table nodes.
  def filter_replies!
    document.xpath('//blockquote').each do |quote|
      quote.replace('> ')
    end
    document.xpath('//table').each { |table| table.remove }
  end

  # The document's inner HTML after filter_replies! has run, memoized so the
  # filtering is applied once per parser instance.
  def filtered_html
    return @filtered_html if @filtered_html

    filter_replies!
    @filtered_html = document.inner_html
  end

  # The filtered HTML converted through Html2Text.convert, memoized.
  def filtered_text
    return @filtered_text if @filtered_text

    @filtered_text = Html2Text.convert(filtered_html)
  end
end
61 changes: 29 additions & 32 deletions app/presenters/mail_presenter.rb
Expand Up @@ -8,30 +8,48 @@ def initialize(mail, account = nil)
end

def subject
encode_to_unicode(@mail.subject || '')
encode_to_unicode(@mail.subject)
end

def text_content
@decoded_text_content ||= encode_to_unicode(text_part&.decoded || decoded_message || '')
@decoded_text_content = select_body || ''
encoding = @decoded_text_content.encoding

body = EmailReplyTrimmer.trim(@decoded_text_content)

return {} if @decoded_text_content.blank?

@text_content ||= {
full: @decoded_text_content,
reply: extract_reply(@decoded_text_content)[:reply],
quoted: extract_reply(@decoded_text_content)[:quoted_text]
full: select_body,
reply: @decoded_text_content,
quoted: body.force_encoding(encoding).encode('UTF-8')
}
end

# Picks the body to work with: the mail's text part, else its HTML part, else
# the mail itself, decoded and passed through encode_to_unicode.
#
# Returns '' when the decoded body contains one of the marker strings below,
# the HtmlParser result when the mail's content type mentions 'text/html',
# and the decoded body otherwise.
def select_body
  part = mail.text_part || mail.html_part || mail
  body = encode_to_unicode(part.decoded)

  # Marker phrases taken to mean the message was not parsed correctly.
  # NOTE(review): the alternation matches either the full
  # 'Content-Type: multipart/alternative' string or a bare 'text/plain'
  # anywhere in the body - confirm the bare match is intended.
  unparsed_marker = %r{(Content-Type: multipart/alternative|text/plain)}
  return '' if unparsed_marker.match?(body)

  # NOTE(review): this inspects the content type of the top-level mail, not
  # of the part selected above - confirm that is the desired check.
  content_type = mail.content_type || ''
  return body unless content_type.include?('text/html')

  ::HtmlParser.parse_reply(body)
end

def html_content
@decoded_html_content ||= encode_to_unicode(html_part&.decoded)
@decoded_html_content = select_body || ''

return {} if @decoded_html_content.blank?

body = EmailReplyTrimmer.trim(@decoded_html_content)

@html_content ||= {
full: @decoded_html_content,
reply: extract_reply(@decoded_html_content)[:reply],
quoted: extract_reply(@decoded_html_content)[:quoted_text]
full: select_body,
reply: @decoded_html_content,
quoted: body
}
end

Expand All @@ -47,14 +65,6 @@ def attachments
end
end

# Decoded content for the mail: the whole mail when it is not multipart,
# otherwise the decoded text part, or nil when a multipart mail has none.
def decoded_message
  return mail.decoded unless mail.multipart?

  mail.text_part.decoded if mail.text_part
end

# How many attachments the object returned by `mail` reports.
def number_of_attachments
  attached = mail.attachments
  attached.count
end
Expand Down Expand Up @@ -114,21 +124,8 @@ def encode_to_unicode(str)
return str if current_encoding == 'UTF-8'

str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
end

def extract_reply(content)
@regex_arr ||= quoted_text_regexes

content_length = content.length
# calculates the matching regex closest to top of page
index = @regex_arr.inject(content_length) do |min, regex|
[(content.index(regex) || content_length), min].min
end

{
reply: content[0..(index - 1)].strip,
quoted_text: content[index..].strip
}
rescue StandardError
''
end

def quoted_text_regexes
Expand Down
47 changes: 47 additions & 0 deletions spec/fixtures/files/mail_with_quote.eml
@@ -0,0 +1,47 @@
MIME-Version: 1.0
Date: Thu, 19 Aug 2021 14:14:31 +0530
References: <CAFkiBVxGoURoqdkY-O_25F-8b41kb-GWBc6hh4Djd5ynwOikXA@mail.gmail.com> <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
In-Reply-To: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
Message-ID: <CAFkiBVwJjO_k_e-LpiKi7MAQAKbHX5nkEPcf0y1R=bjcEHogMg@mail.gmail.com>
Subject: Re: Checking mail forwarding to cw inbox
From: Sony Mathew <sony@chatwoot.com>
To: Tejaswini <reply+6bdc3f4d-0bec-4515-a284-5d916fdde489@example.com>
Content-Type: multipart/alternative; boundary="0000000000004af64505c9e58f03"

--0000000000004af64505c9e58f03
Content-Type: text/plain; charset="UTF-8"
Yes, I am providing you step how to reproduce this issue
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test <
tejaswini@chatwoot.com> wrote:
> Any update on this?
>
>
--
* Sony Mathew*
Software developer
*Mob:9999999999

--0000000000004af64505c9e58f03
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

<div dir=3D"ltr">Yes, I am providing you step how to reproduce this issue</=
div><br><div class=3D"gmail_quote"><div dir=3D"ltr" class=3D"gmail_attr">On=
Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test &l=
t;<a href=3D"mailto:tejaswini@chatwoot.com">tejaswini@chatwoot.com</a>&gt; wrot=
e:<br></div><blockquote class=3D"gmail_quote" style=3D"margin:0px 0px 0px 0=
.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"> <p>
</p><p>Any update on this?</p>

<p></p>
</blockquote></div><br clear=3D"all"><div><br></div>-- <br><div dir=3D"ltr"=
class=3D"gmail_signature"><div dir=3D"ltr"><div><div dir=3D"ltr"><div><div=
><b>Sony Mathew.</b><br></div><span style=3D"font-family:&quot;times ne=
w roman&quot;,serif"><span></span><span></span>Software developer</span><br=
></div><b>Mob:9999999999</b></div></div></div></div>

--0000000000004af64505c9e58f03--

0 comments on commit 44486fc

Please sign in to comment.