From 92aa5bb2329c39cf97d4399839989e7401820ae4 Mon Sep 17 00:00:00 2001
From: lassulus <lass@blue.r>
Date: Fri, 6 Jul 2018 17:42:04 +0200
Subject: Reaktor url-title: fix some issues with weird urls

ref: https://irc-bot-science.clsr.net/
---
 krebs/5pkgs/simple/Reaktor/plugins.nix | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'krebs/5pkgs')
diff --git a/krebs/5pkgs/simple/Reaktor/plugins.nix b/krebs/5pkgs/simple/Reaktor/plugins.nix
index cd389366e..4a7917b68 100644
--- a/krebs/5pkgs/simple/Reaktor/plugins.nix
+++ b/krebs/5pkgs/simple/Reaktor/plugins.nix
@@ -121,21 +121,27 @@ rec {
     pattern = "^.*(?P<args>http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+).*$$";
     path = with pkgs; [ curl perl ];
     script = pkgs.writePython3 "url-title" [ "beautifulsoup4" "lxml" ] ''
+      import cgi
       import sys
       import urllib.request
       from bs4 import BeautifulSoup
 
       try:
-          soup = BeautifulSoup(urllib.request.urlopen(sys.argv[1]), "lxml")
-          title = soup.find('title').string
+          resp = urllib.request.urlopen(sys.argv[1])
+          if resp.headers['content-type'].find('text/html') >= 0:
+              soup = BeautifulSoup(resp.read(16000), "lxml")
+              title = soup.find('title').string
 
-          if title:
-              if len(title) > 512:
-                  print('message to long, skipped')
-              elif len(title.split('\n')) > 5:
-                  print('to many lines, skipped')
-              else:
-                  print(title)
+              if title:
+                  if len(title) > 450:
+                      print('message to long, rest skipped')
+                  elif len(title.split('\n')) > 5:
+                      print('to many lines, skipped')
+                  else:
+                      print(title)
+          else:
+              cd_header = resp.headers['content-disposition']
+              print(cgi.parse_header(cd_header)[1]['filename'])
       except:  # noqa: E722
           pass
     '';
-- 
cgit v1.2.3


From 48fdd37c032bfb2e53ca94ee5b48633a24e1e897 Mon Sep 17 00:00:00 2001
From: lassulus <lass@blue.r>
Date: Sat, 7 Jul 2018 14:40:59 +0200
Subject: Reaktor url-title: show 450 chars max

---
 krebs/5pkgs/simple/Reaktor/plugins.nix | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'krebs/5pkgs')

diff --git a/krebs/5pkgs/simple/Reaktor/plugins.nix b/krebs/5pkgs/simple/Reaktor/plugins.nix
index 4a7917b68..3730b9e66 100644
--- a/krebs/5pkgs/simple/Reaktor/plugins.nix
+++ b/krebs/5pkgs/simple/Reaktor/plugins.nix
@@ -127,18 +127,17 @@ rec {
       from bs4 import BeautifulSoup
 
       try:
-          resp = urllib.request.urlopen(sys.argv[1])
+          req = urllib.request.Request(sys.argv[1])
+          req.add_header('user-agent', 'Reaktor-url-title')
+          resp = urllib.request.urlopen(req)
           if resp.headers['content-type'].find('text/html') >= 0:
               soup = BeautifulSoup(resp.read(16000), "lxml")
               title = soup.find('title').string
 
-              if title:
-                  if len(title) > 450:
-                      print('message to long, rest skipped')
-                  elif len(title.split('\n')) > 5:
-                      print('to many lines, skipped')
-                  else:
-                      print(title)
+              if len(title.split('\n')) > 5:
+                  title = '\n'.join(title.split('\n')[:5])
+
+              print(title[:450])
           else:
               cd_header = resp.headers['content-disposition']
               print(cgi.parse_header(cd_header)[1]['filename'])
-- 
cgit v1.2.3