diff --git a/Gemfile b/Gemfile index bca6463d5fc2..f3fbce89d389 100644 --- a/Gemfile +++ b/Gemfile @@ -121,6 +121,10 @@ gem 'hairtrigger' gem 'procore-sift' +# parse email +gem 'email_reply_trimmer' +gem 'html2text' + group :production, :staging do # we dont want request timing out in development while using byebug gem 'rack-timeout' diff --git a/Gemfile.lock b/Gemfile.lock index 9e90995a1992..b61ef372cccc 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -179,6 +179,7 @@ GEM addressable (~> 2.8) ecma-re-validator (0.3.0) regexp_parser (~> 2.0) + email_reply_trimmer (0.1.13) erubi (1.10.0) erubis (2.7.0) et-orbi (1.2.5) @@ -290,6 +291,8 @@ GEM hashdiff (1.0.1) hashie (4.1.0) hkdf (0.3.0) + html2text (0.2.1) + nokogiri (~> 1.6) http-accept (1.7.0) http-cookie (1.0.4) domain_name (~> 0.5) @@ -668,6 +671,7 @@ DEPENDENCIES devise_token_auth dotenv-rails down (~> 5.0) + email_reply_trimmer facebook-messenger factory_bot_rails faker @@ -682,6 +686,7 @@ DEPENDENCIES haikunator hairtrigger hashie + html2text image_processing jbuilder json_refs diff --git a/app/presenters/html_parser.rb b/app/presenters/html_parser.rb new file mode 100644 index 000000000000..716e1a69acbc --- /dev/null +++ b/app/presenters/html_parser.rb @@ -0,0 +1,31 @@ +class HtmlParser + def self.parse_reply(raw_body) + new(raw_body).filtered_text + end + + attr_reader :raw_body + + def initialize(raw_body) + @raw_body = raw_body + end + + def document + @document ||= Nokogiri::HTML(raw_body) + end + + def filter_replies! + document.xpath('//blockquote').each { |n| n.replace('> ') } + document.xpath('//table').each(&:remove) + end + + def filtered_html + @filtered_html ||= begin + filter_replies! + document.inner_html + end + end + + def filtered_text + @filtered_text ||= Html2Text.convert(filtered_html) + end +end diff --git a/app/presenters/mail_presenter.rb b/app/presenters/mail_presenter.rb index 6bb438cda3c8..dc6ce78d29ca 100644 --- a/app/presenters/mail_presenter.rb +++ b/app/presenters/mail_presenter.rb @@ -8,30 +8,48 @@ def initialize(mail, account = nil) end def subject - encode_to_unicode(@mail.subject || '') + encode_to_unicode(@mail.subject) end def text_content - @decoded_text_content ||= encode_to_unicode(text_part&.decoded || decoded_message || '') + @decoded_text_content = select_body || '' + encoding = @decoded_text_content.encoding + + body = EmailReplyTrimmer.trim(@decoded_text_content) return {} if @decoded_text_content.blank? @text_content ||= { - full: @decoded_text_content, - reply: extract_reply(@decoded_text_content)[:reply], - quoted: extract_reply(@decoded_text_content)[:quoted_text] + full: select_body, + reply: @decoded_text_content, + quoted: body.force_encoding(encoding).encode('UTF-8') } end + def select_body + message = mail.text_part || mail.html_part || mail + decoded = encode_to_unicode(message.decoded) + # Certain trigger phrases that means we didn't parse correctly + return '' if %r{(Content-Type: multipart/alternative|text/plain)}.match?(decoded) + + if (mail.content_type || '').include? 'text/html' + ::HtmlParser.parse_reply(decoded) + else + decoded + end + end + def html_content - @decoded_html_content ||= encode_to_unicode(html_part&.decoded) + @decoded_html_content = select_body || '' return {} if @decoded_html_content.blank? + body = EmailReplyTrimmer.trim(@decoded_html_content) + @html_content ||= { - full: @decoded_html_content, - reply: extract_reply(@decoded_html_content)[:reply], - quoted: extract_reply(@decoded_html_content)[:quoted_text] + full: select_body, + reply: @decoded_html_content, + quoted: body } end @@ -47,14 +65,6 @@ def attachments end end - def decoded_message - if mail.multipart? - return mail.text_part ? mail.text_part.decoded : nil - end - - mail.decoded - end - def number_of_attachments mail.attachments.count end @@ -114,21 +124,8 @@ def encode_to_unicode(str) return str if current_encoding == 'UTF-8' str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?') - end - - def extract_reply(content) - @regex_arr ||= quoted_text_regexes - - content_length = content.length - # calculates the matching regex closest to top of page - index = @regex_arr.inject(content_length) do |min, regex| - [(content.index(regex) || content_length), min].min - end - - { - reply: content[0..(index - 1)].strip, - quoted_text: content[index..].strip - } + rescue StandardError + '' end def quoted_text_regexes diff --git a/spec/fixtures/files/mail_with_quote.eml b/spec/fixtures/files/mail_with_quote.eml new file mode 100644 index 000000000000..2019dd6928b8 --- /dev/null +++ b/spec/fixtures/files/mail_with_quote.eml @@ -0,0 +1,47 @@ +MIME-Version: 1.0 +Date: Thu, 19 Aug 2021 14:14:31 +0530 +References: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com> +In-Reply-To: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com> +Message-ID: +Subject: Re: Checking mail forwarding to cw inbox +From: Sony Mathew +To: Tejaswini +Content-Type: multipart/alternative; boundary="0000000000004af64505c9e58f03" + +--0000000000004af64505c9e58f03 +Content-Type: text/plain; charset="UTF-8" + +Yes, I am providing you step how to reproduce this issue + +On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test < +tejaswini@chatwoot.com> wrote: + +> Any update on this? +> +> + +-- +* Sony Mathew* +Software developer +*Mob:9999999999 + +--0000000000004af64505c9e58f03 +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +
Yes, I am providing you step how to reproduce this issue
On= + Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test &l= +t;tejaswini@chatwoot.com> wrot= +e:

+

Any update on this?

+ +

+


--
Sony Mathew.
Software developer
Mob:9999999999
+ +--0000000000004af64505c9e58f03-- diff --git a/spec/fixtures/files/welcome_html.eml b/spec/fixtures/files/welcome_html.eml new file mode 100644 index 000000000000..729f533cb609 --- /dev/null +++ b/spec/fixtures/files/welcome_html.eml @@ -0,0 +1,1061 @@ +Delivered-To: tejaswini@chatwoot.com +Received: by 2002:a05:7110:a0a6:b0:11b:be8b:591e with SMTP id o6csp4217866gee; + Mon, 20 Dec 2021 22:29:33 -0800 (PST) +X-Received: by 2002:a05:6638:38a0:: with SMTP id b32mr1050456jav.200.1640068173387; + Mon, 20 Dec 2021 22:29:33 -0800 (PST) +ARC-Seal: i=1; a=rsa-sha256; t=1640068173; cv=none; + d=google.com; s=arc-20160816; + b=TEtBl28WlEfNA6zQhv0ABl/8Xlo51uARUwc86zoaCZtl6VhLOhBCiKprg6RYeIddU2 + vuk9jJ/a5UbSGbAHm7sul59AtWpBgG70ncaxgoDbzCcmIvF3eiVmqL/86mLcdg8iw7qv + y5G0cGGAS90QmpHGI1NpbrqsaMSYGNxdBAbKEc8MptMNX6YMRtxkBHys2RytYZnAKDMK + 6Z7/Rw5dTqTNOsEAAIpEuWt+OkUFueYUUxzQWjvzoKJLxeXwaEcoDA7zNvgcfX1Vi0gt + 2ACeODiEw32nf8OFHOzL1GHZ1KORR0k/EBa5JiA3yuyzO2T6E6x8eMLXN0RE8Sz510vB + 798w== +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; + h=to:subject:message-id:date:from:in-reply-to:references:mime-version + :dkim-signature; + bh=INv+/+ALDvIppLZlFlmI+SEvYemXp/p7klJURJIMI7U=; + b=xaXEv4tduwW8acrwdMgVs/zoy/LFkjVa6E37T+DWjKmxh8ded438RPxOKfASoIUd/X + 3hZu8ZHoVjuaUNYt2YfnD9svbO9lbMc2nlZr8mfop8RenXkJaPqt4q3sNXWItgNdLv5b + MmiPtCRb2oxq3u7VR7clEu0SAdvdett5AqwEFaZEMsT6yMoDdAyKRg3XyyB9c43/91kY + +MZSbDHv7OI3FZCgYlhCRkRWhjbLOi4mSqAnuEAS8Iwjmcn9pIaNcgkDhrHm8wp2SpDQ + GRVxyacJ/pTLS4azOPZOQ0zDslevdf/dCyKvCePv2QA+eMkCK8gLquQRQoss2B4uej36 + AuXg== +ARC-Authentication-Results: i=1; mx.google.com; + dkim=pass header.i=@gmail.com header.s=20210112 header.b=CGxGg1rV; + spf=pass (google.com: domain of Sony@chatwoot.com designates 209.85.220.41 as permitted sender) smtp.mailfrom=Sony@chatwoot.com; + dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com +Return-Path: +Received: from mail-sor-f41.google.com (mail-sor-f41.google.com. [209.85.220.41]) + by mx.google.com with SMTPS id i11sor5238056ilu.143.2021.12.20.22.29.33 + for + (Google Transport Security); + Mon, 20 Dec 2021 22:29:33 -0800 (PST) +Received-SPF: pass (google.com: domain of Sony@chatwoot.com designates 209.85.220.41 as permitted sender) client-ip=209.85.220.41; +Authentication-Results: mx.google.com; + dkim=pass header.i=@gmail.com header.s=20210112 header.b=CGxGg1rV; + spf=pass (google.com: domain of Sony@chatwoot.com designates 209.85.220.41 as permitted sender) smtp.mailfrom=Sony@chatwoot.com; + dmarc=pass (p=NONE sp=QUARANTINE dis=NONE) header.from=gmail.com +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=gmail.com; s=20210112; + h=mime-version:references:in-reply-to:from:date:message-id:subject:to; + bh=INv+/+ALDvIppLZlFlmI+SEvYemXp/p7klJURJIMI7U=; + b=CGxGg1rVQBWFB2A0GDOx2F9M6C/QkZayZTseFV0GEsO/Ok+7KsEA+mOHZdUOXqTEfw + WXwE9r9B5JxpYMq1u6a79LHQTV4kTNbAVmDO3fJP3h+GM0LW+szDkfzOW2waDri6CtD9 + 9hk940kOJhl19cBrLcrGOE9wNvwP6yYAJxh5ZVtgV0pUhUEoHOZfDNI22kGeEXzwco2T + ZfdSoWxmp75Hg7rIgJBLMkFN9/2qtPLy0aEcvAEsbE+c9vmIDaqvHegvOiSC+mDOtdo0 + OdJZJzLraA7ZYGxHmfBWVYq/9Ros1Q15A2L5aSdKJLvQgRQl26/vO+Bht1ZqyH22et+r + oIiw== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:mime-version:references:in-reply-to:from:date + :message-id:subject:to; + bh=INv+/+ALDvIppLZlFlmI+SEvYemXp/p7klJURJIMI7U=; + b=XEJl7va2s8ZPhCO1d+h4a217L/ByFYd8DHqr6oPMiNhkj/kiHzOLKQuvwOQb66nIXq + IRcLnzzlsoGe3POF+mAWvCHV0fNzMm6qHPGG8QzGXAx4QRiWsJI+0xNhQVmfGKj0/2/c + gvcyIfPJisv229IR3pZXh4mpc9yqZYDIbp1OK90ROmBSBG9uKL5FVhdc7AxbM9zy59vz + 5QjuI0FZlIlq7rJzyTrsf1wMUtSRd2e3aRzg1luChyH57+kSDaEO8Qrfw/6+c63/ljQw + Ou00pgxPi6BStcD5ZyD+qHQ5XrmGX5GERitxA5bWa5JyC41pZsx3sQNkE2iT8nktJr+j + I7rA== +X-Gm-Message-State: AOAM531I6Run21O8HiwUKEvIBbxxAhubJ3AyHQHSvq0u84taQpvrSh9+ h7itpdokByTLuqgMSj7/qr6R0KouqD0f8RVGiQiKS3xkZfw= +X-Google-Smtp-Source: ABdhPJyaWpa+kXiDLdmNH4G5TUlfMxiwd1kgrcVXB6LB3QvhdnFkKm/tbQPzzhbutpnAOPWu+oXKU2Pj/hVH2XoPsoc= +X-Received: by 2002:a92:db51:: with SMTP id w17mr849806ilq.213.1640068172521; Mon, 20 Dec 2021 22:29:32 -0800 (PST) +MIME-Version: 1.0 +References: +In-Reply-To: +From: Sony Mathew +Date: Tue, 21 Dec 2021 11:29:20 +0500 +Message-ID: +Subject: Fwd: How good are you in English? How did you improve your English? +To: tejaswini@chatwoot.com +Content-Type: multipart/alternative; boundary="000000000000d4a5b005d3a220ef" + +--000000000000d4a5b005d3a220ef +Content-Type: text/plain; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +I'm learning English as a first language for the past 13 years, but to be +honest, my English was drastically improved only in the past 3 years. I did +my kindergarten in an ICSC school in Madura=E2=80=A6 +[image: test] + + +Test's Digest +Top Stories For You + +How good are you in English? How did you improve your English? + + +*Nakul Sethuram +*, +Student =E2=80=A2 Updated March 26 + + +I'm learning English as a first language for the past 13 years, but to be +honest, my English was drastically improved only in the past 3 years. + +I did my kindergarten in an I... + +Read More =C2=BB + +[image: View] + + +7.2M + +[image: +Upvote] + + +770 + +[image: +Comment] + + +93 + + +A new software developer spent 4 days to make a simple HTML button and CSS, +should I fire him? + + +*Gee Fishel +*, +20+ years of software development, from gaming to government to the +enterprise. =E2=80=A2 Answered December 6, 2017 + + +Maybe, but probably not. + +First it depends on what you mean by =E2=80=9Cmake a simple HTML button and= + CSS.=E2=80=9D + +If it took him 4 days to create a new html page with just a simple button +a... + +Read More =C2=BB + +[image: View] + + +919.5K + +[image: +Upvote] + + +5.2K + +[image: +Comment] + + +133 + + +How do I get great Instagram followers? + + +Mary Parr + +=E2=80=A2 Answered April 23 + + +Here are the most popular ways to get Instagram followers: + + 1. Join mutual PR groups + +To quickly gain the first 1k followers, join mutual PR groups. The +advantage of this method i... + +Read More =C2=BB + +[image: View] + + +348.3K + +[image: +Upvote] + + +25 + +[image: +Comment] + + +1 + + +Did Kim Taehyung of BTS (Tae(V) & his family receive any threats? + + +*Nyx Amara +*, +=F0=9D=93=AD=F0=9D=93=B2=F0=9D=93=B0=F0=9D=93=B2=F0=9D=93=BD=F0=9D=93=AA=F0= +=9D=93=B5 =F0=9D=93=AA=F0=9D=93=BB=F0=9D=93=BD=F0=9D=93=B2=F0=9D=93=BC=F0= +=9D=93=BD =E2=80=A2 Updated January 21 + + +RM And V got death threats from people after V commented on a picture +asking where he was in the photo. Of course it was meant as a joke but +people hated his comment. Rm, V... + +Read More =C2=BB + +[image: View] + + +1.3M + +[image: +Upvote] + + +588 + +[image: +Comment] + + +8 + + +Why is 0.1+0.2 not equal to 0.3 in most programming languages? + + +*Joe Zbiciak +*, +Developed practical algorithms actually used in production. =E2=80=A2 Updat= +ed +September 12 + + +Computers implement a wide range of arithmetic schemes. In some, such as +decimal floating point and rational arithmetic, 0.1 + 0.2 does equal 0.3. +One computer I own uses r... + +Read More =C2=BB + +[image: View] + + +1.6M + +[image: +Upvote] + + +218 + +[image: +Comment] + + +20 + +Read More in Your Feed + +Never miss a story. Designed for readers on the go. +[image: Download on the App Store] +[image: +Get it on Google Play] + +Missing out on test emails? Be sure to add us to your primary inbox. +This email was sent by test (605 Castro Street, Mountain View, CA 94041). +If you don't want to receive this type of email in the future, please +unsubscribe + +. +https://www.test.com + +--000000000000d4a5b005d3a220ef +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +
Hi te= +jas, it's the sample of html email. we talk about it on discord.
<= +br>
I'm learning English as = +a first language for the past 13 years, but to be honest, my English was dr= +astically improved only in the past 3 years. I did my kindergarten in an IC= +SC school in Madura=E2=80=A6
Test's Digest
3D"test"
<= +img alt=3D"" height=3D"1" src=3D"https://www.test.com/qemail/mark_read?ct= +=3D1639910639224715&et=3D2&id=3Dce78432ee0f54fb1be4bb5e816968625&am= +p;request_id=3D1072810637337218477&src=3D1&st=3D1639910639224715&am= +p;stories=3D1_dU7XIw3qD61%7C1_a25mKlVfEJr%7C1_3TBVTr7aNnK%7C1_oBfvMdLpFVj%7= +C1_Ypw2TI1aZu8&uid=3DdoCdkaXtR9U&v=3D0" style=3D"object-fit:contain= +;border:0;display:block;outline:none;text-decoration:none;height:1px;width:= +100%;font-size:13px" width=3D"1">
=E2=80=8A
Top Stories For You

=E2=80=8A

<= +b>Nakul Sethuram, Student =E2=80=A2 Updated March 26

=E2=80=8A

<= +span style=3D"color:#333333">Gee Fishel, 20+ years of software development, f= +rom gaming to government to the enterprise. =E2=80=A2 Answered D= +ecember 6, 2017

=E2=80=8A
<= +/td>
= +

=E2=80=8A

<= +span style=3D"font-weight:700">Nyx Amara, =F0=9D=93=AD=F0=9D=93=B2=F0=9D=93=B0=F0=9D=93=B2=F0=9D=93=BD=F0=9D= +=93=AA=F0=9D=93=B5 =F0=9D=93=AA=F0=9D=93=BB=F0=9D=93=BD=F0=9D=93=B2=F0=9D= +=93=BC=F0=9D=93=BD =E2=80=A2 Updated January 21

= +

=E2=80=8A
Why is 0.1+0.2 not equal to 0.3 in most pro= +gramming languages?

Joe Zbiciak,= + Developed practical algorithms actually used in production. =E2=80=A2 Updated September 12

3D"View"

1.6M

3D"Upvote"3D"Comment"

20

= +
Read More in Your Feed
= +
Never miss a story. Designed for readers on the go.
=E2=80=8A
3D"Download
<= +/div> + +--000000000000d4a5b005d3a220ef-- diff --git a/spec/mailboxes/reply_mailbox_spec.rb b/spec/mailboxes/reply_mailbox_spec.rb index 97e47bf06245..acbc0929f380 100644 --- a/spec/mailboxes/reply_mailbox_spec.rb +++ b/spec/mailboxes/reply_mailbox_spec.rb @@ -7,6 +7,7 @@ let(:account) { create(:account) } let(:agent) { create(:user, email: 'agent1@example.com', account: account) } let(:reply_mail) { create_inbound_email_from_fixture('reply.eml') } + let(:mail_with_quote) { create_inbound_email_from_fixture('mail_with_quote.eml') } let(:conversation) { create(:conversation, assignee: agent, inbox: create(:inbox, account: account, greeting_enabled: false), account: account) } let(:described_subject) { described_class.receive reply_mail } let(:serialized_attributes) do @@ -95,5 +96,35 @@ expect(conversation_1.messages.last.content).to eq("Let's talk about these images:") end end + + context 'with quotes in email' do + let(:described_subject) { described_class.receive mail_with_quote } + + before do + # this UUID is hardcoded in the reply.eml, that's why we are updating this + conversation.uuid = '6bdc3f4d-0bec-4515-a284-5d916fdde489' + conversation.save + end + + it 'add the mail content as new message on the conversation' do + described_subject + expect(conversation.messages.last.content).to eq( + <<-BODY.strip_heredoc.chomp + Yes, I am providing you step how to reproduce this issue + + On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test < tejaswini@chatwoot.com> wrote: + + > Any update on this? + > + > + + -- + * Sony Mathew* + Software developer + *Mob:9999999999 + BODY + ) + end + end end end diff --git a/spec/presenters/html_parser_spec.rb b/spec/presenters/html_parser_spec.rb new file mode 100644 index 000000000000..7b4868149a40 --- /dev/null +++ b/spec/presenters/html_parser_spec.rb @@ -0,0 +1,15 @@ +require 'rails_helper' +RSpec.describe HtmlParser do + include ActionMailbox::TestHelper + + describe 'parsed mail decorator' do + let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail } + + it 'parse html content in the mail' do + decorated_html_mail = described_class.parse_reply(html_mail.text_part.decoded) + expect(decorated_html_mail[0..70]).to eq( + "I'm learning English as a first language for the past 13 years, but to " + ) + end + end +end diff --git a/spec/presenters/mail_presenter_spec.rb b/spec/presenters/mail_presenter_spec.rb index e3f535bda78c..c1049e9449eb 100644 --- a/spec/presenters/mail_presenter_spec.rb +++ b/spec/presenters/mail_presenter_spec.rb @@ -4,6 +4,7 @@ describe 'parsed mail decorator' do let(:mail) { create_inbound_email_from_fixture('welcome.eml').mail } + let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail } let(:decorated_mail) { described_class.new(mail) } let(:mail_with_no_subject) { create_inbound_email_from_fixture('mail_with_no_subject.eml').mail } @@ -56,5 +57,13 @@ it 'give email from in downcased format' do expect(decorated_mail.from.first.eql?(mail.from.first.downcase)).to eq true end + + it 'parse html content in the mail' do + decorated_html_mail = described_class.new(html_mail) + expect(decorated_html_mail.subject).to eq('Fwd: How good are you in English? How did you improve your English?') + expect(decorated_html_mail.text_content[:reply][0..70]).to eq( + "I'm learning English as a first language for the past 13 years, but to " + ) + end end end
<= +tr>
=E2=80=8A
Missing out on test emails? Be sure to add us to your primar= +y inbox.
<= +div style=3D"font-family:system-ui,Segoe UI,sans-serif;font-size:11px;line-= +height:1.6;text-align:center;color:#939598">This email was sent by test (6= +05 Castro Street, Mountain View, CA 94041).
If you don't want to rec= +eive this type of email in the future, please unsubsc= +ribe.