IRC bot
#vnluser used to have a bot to grab link titles and search google. It didn't work very well with Vietnamese characters, so it was retired. Here's a replacement in rust and python. Its name is
luser
There are several things it could improve on:
[X] Handle gzip responses (news.zing.vn and baomoi.com): These sites always return gzip-encoded HTML, even when Accept-Encoding doesn't list it. You can check by comparing `curl` vs `curl --compressed`.
[ ] Configurable multiple channels support, possibly as commandline arguments, should also set owners.
[ ] Reading secrets from files: API keys, identify passwords, channel passwords.
[ ] Tests: It would be difficult to interface with IRC servers, but at least the handling functions should be tested.
[ ] CLI interface: Like chatbot's, this will speed up testing.
[X] NYTimes access: The cookie needs to be periodically rebaked. Theoretically cloaking as googlebot should work.
[ ] Pod titles in rust: Quick-xml doesn't return the attributes iterator currently.
[ ] Reconnection on ping timeout: IrcServer should already reconnect automatically, but possibly ipv6 tunneling is interfering.
Compilation to ARMv7 is a bit tricky:
- Set up cross-compilation toolchain
- Copy the target's
/usr/include/openssl, lib{crypto,ssl,z}.so to $PWD
- Run cargo with:
TARGET_CFLAGS="-I $PWD" cargo rustc --release --target armv7-unknown-linux-gnueabihf -- -C linker=arm-linux-gnueabihf-gcc -lz -L $PWD
- Copy
target/armv7-unknown-linux-gnueabihf/release/luser to the target
You can run tests for the python version:
python -m doctest ~/Public/luser.py
Now on to the code. Unfortunately, tangling from this Org source loses indentations, so the python code can't be commented on line by line. Here are the API keys and rust build setup:
3JEW42-4XXE264A93
trnsl.1.1.20160210T093900Z.c6eacf09bbb65cfb.cc28de2ba798bc3bc118e9f8201b6e6cea697810
[package] name = "luser" version = "0.1.0" authors = ["Hoàng Đức Hiếu hdhoang@hdhoang.space"] [dependencies] irc = "*" scraper = "*" hyper = "*" regex = "*" quick-xml = "*" rustc-serialize = "*" url = "*"
Importing dependencies. Python really comes with batteries.
// Crate roots and the names used throughout the bot.
extern crate regex;
extern crate irc;
extern crate scraper;
extern crate hyper;
extern crate quick_xml;
extern crate rustc_serialize;
extern crate url;
use regex::Regex;
use irc::client::prelude::{IrcServer, Server, ServerExt, Config, Command, Response};
use hyper::client::Client;
use std::io::Read;
use std::collections::HashMap;
# -*- coding: utf-8 -*- # external batteries from bs4 import BeautifulSoup from irc import bot, connection from collections import defaultdict from random import randint from gzip import GzipFile import xml.etree.ElementTree as ET import json import sys import os if sys.version_info.major == 3: from urllib.request import urlopen, build_opener, HTTPCookieProcessor from urllib.parse import quote from http.client import HTTPConnection import html else: from urllib2 import urlopen, quote, build_opener, HTTPCookieProcessor from httplib import HTTPConnection from StringIO import StringIO from htmlparser import HTMLParser html = HTMLParser() reload(sys) sys.setdefaultencoding('utf8')
Set up logging.
import logging
import logging.handlers

# Module-wide logger, named after this source file.
logger = logging.getLogger(__file__)


def setup_logging(filename, path=None, verbose=False):
    """Attach a midnight-rotating file handler (31 backups) to `logger`.

    filename -- log file name, created under `path`
    path     -- target directory; defaults to this file's directory
    verbose  -- when true, record DEBUG instead of INFO
    """
    target_dir = path or os.path.dirname(os.path.realpath(__file__))
    handler = logging.handlers.TimedRotatingFileHandler(
        os.path.join(target_dir, filename), when="midnight", backupCount=31)
    handler.setLevel(logging.DEBUG if verbose else logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)-15s (%(name)s) %(message)s'))
    logger.addHandler(handler)
Set up connection details. People should claim their names here.
const NAME: &'static str = "luser";

fn main() {
    // Build the freenode connection. NB: fn main() continues across the
    // following tangled blocks of this Org file.
    let freenode = IrcServer::from_config(Config {
            // The unix user running the process owns the bot.
            owners: Some(vec![env!("USER").into()]),
            nickname: Some(NAME.into()),
            // Fallback nicks luser-0 .. luser-9 for collisions.
            alt_nicks: Some((0..10).map(|n| format!("{}-{}", NAME, n)).collect()),
            server: Some("chat.freenode.net".into()),
            port: Some(8000),
            channels: Some(vec![format!("#vn{}", NAME), format!("#{}-test", NAME)]),
            ..Default::default()
        })
        .unwrap();
    // Register (NICK/USER, and NickServ IDENTIFY when configured).
    freenode.identify().unwrap();
NAME = "luser"
# Connect over IPv6 to freenode's non-TLS port 8000.
luser = bot.SingleServerIRCBot([("chat.freenode.net", 8000)], NAME, NAME,
                               connect_factory=connection.Factory(ipv6=True))


def main():
    """Start logging and run the bot's event loop (blocks)."""
    setup_logging("luser.log")
    luser.start()


def change_nick(c, e):
    # Nick already in use: fall back to luser-<random digit>.
    new_nick = '{}-{}'.format(NAME, str(randint(0, 9)))
    print("Changing nick to", new_nick)
    c.nick(new_nick)
luser.on_nicknameinuse = change_nick
luser.on_nickcollision = lambda c, _: c.reconnect()


def join_channels(c, e):
    # Join the test channel and the main channel once registered.
    c.join("#{}-test".format(NAME))
    c.join("#vn" + NAME)
luser.on_welcome = join_channels
Handling scaffolding
    // Reborrow each fn item as an &Fn trait object so the array below has a
    // homogeneous element type.
    let get_title = &get_title;
    let wolframalpha = &wolframalpha;
    let google = &google;
    let translate = &translate;
    // Pair each trigger regex with its handler function.
    let handlers = [Handler(Regex::new(TITLE_REGEX).unwrap(), get_title),
                    Handler(Regex::new(WA_REGEX).unwrap(), wolframalpha),
                    Handler(Regex::new(GOOGLE_REGEX).unwrap(), google),
                    Handler(Regex::new(TRANSLATE_REGEX).unwrap(), translate)];
def handling(c, e):
    """Shard unaddressed work across lusers: this instance replies only when
    the sender's prefix length modulo the roster size selects our nick."""
    return lusers[len(e.source) % len(lusers)] == c.get_nickname()


def handle(c, e, msg):
    """Dispatch msg: URL titles, .tell relays, and .g/.wa/.tr commands."""
    try:
        titles = title(msg)
        if titles and handling(c, e):
            c.privmsg(e.target, titles)
        # Commands must start with one of the sigils.
        if msg[0] not in ('.', '!', ':'):
            return
        if msg[1:6] == 'tell ':
            source = e.source.nick
            (target, _, line) = msg[6:].partition(' ')
            # Queue the line until `target` is next seen.
            return relay_msg[target].append((source, line))
        reply = ''
        if msg[1:3] == 'g ':
            reply = google(msg[3:])
        if msg[1:4] == 'wa ':
            reply = wolframalpha(msg[4:])
        if msg[1:4] == 'tr ':
            (lang, _, text) = msg[4:].partition(' ')
            reply = translate(lang, text)
        if reply:
            # Keep PRIVMSG under 512bytes
            c.privmsg(e.target, reply[:512 - len(e.target) - 50])
    except Exception as e:
        logger.error('"%s" causes: %s' % (msg, str(e)))
List other lusers, and update that list when one joins or quits. This list is used by the lusers to decide whether to handle unaddressed messages. If the length of the IRC prefix 'nick!user@host' for a message indexes to its name, that luser responds.
lusers = []  # sorted roster of luser-named nicks sharing our channels


def list_lusers(c, e):
    """Collect every luser* nick from a NAMES reply into the roster."""
    for luser in filter(lambda n: n.startswith(NAME),
                        e.arguments[-1].split(' ')):
        if luser not in lusers:
            lusers.append(luser)
    lusers.sort()
luser.on_namreply = list_lusers
    // nick -> last line said, for s/old/new/ corrections.
    let mut last_lines = HashMap::new();
    // Sorted, deduplicated roster of fellow luser bots; used to shard work.
    let mut lusers = vec![];
    'messages: for message in freenode.iter() {
        let msg = message.unwrap();
        // NAMES reply: collect every nick starting with "luser".
        if let Command::Response(Response::RPL_NAMREPLY, _, Some(ref names)) = msg.command {
            lusers.extend(names.split(' ')
                .filter(|n| n.starts_with(NAME))
                .map(String::from));
            lusers.sort();
            lusers.dedup();
            // If our own nick is missing from the roster, re-register.
            if !lusers.contains(&freenode.current_nickname().into()) {
                let _ = freenode.reconnect();
            }
            continue 'messages;
        }
        if let Some(nick) = msg.source_nickname() {
Ignore bots and freenode
            // Ignore other bots and freenode services.
            if nick.contains("bot") || nick.contains("freenode") {
                continue 'messages;
            }
            // Track fellow lusers joining/quitting to keep the roster current.
            if nick.starts_with(NAME) {
                let nick = String::from(nick);
                match msg.command {
Do not merge the following arms. Otherwise a join #c1 -> insert -> join #c2 -> remove sequence might happen.
                    // Insert at the sorted position when a fellow luser joins.
                    Command::JOIN(..) => {
                        if let Err(idx) = lusers.binary_search(&nick) {
                            lusers.insert(idx, nick)
                        }
                    }
relay_msg = defaultdict(list) # dictdef relay(c, target, nick): for (source, line) in relay_msg[nick]: c.privmsg(target, "{}: <{}> {}".format(nick, source, line)) del relay_msg[nick] luser.on_nick = lambda c, e: relay(c, "#vnluser", e.target)
The next lambdas are abusing python logical operator, but they read like English.
def luser_joins(e):
    # Keep the roster sorted as fellow lusers arrive.
    if e.source.nick not in lusers:
        lusers.append(e.source.nick)
        lusers.sort()


def on_join(c, e):
    """On join: register fellow lusers, or deliver queued .tell messages."""
    nick = e.source.nick
    if nick.startswith(NAME):
        return luser_joins(e)
    relay(c, e.target, nick)
luser.on_join = on_join
                    // Remove from the roster when a fellow luser quits.
                    Command::QUIT(..) => {
                        if let Ok(idx) = lusers.binary_search(&nick) {
                            lusers.remove(idx);
                        }
                    }
                    _ => (),
                }
# Drop a quitting fellow luser from the roster (short-circuit `and`);
# e.source is 'nick!user@host', which starts with the nick, so startswith works.
luser.on_quit = lambda c, e: e.source.startswith(NAME) and lusers.remove(e.source.nick)
                // Nothing else to do for messages from fellow lusers.
                continue 'messages;
            }
        }
Actual message processing. Ignore the other lusers.
last_lines = defaultdict(list) # dictdef on_pubmsg(c, e): nick = e.source.nick if nick.startswith(NAME): return my_nick = c.get_nickname() msg = e.arguments[0] if msg == "report!": return c.privmsg(e.target, report()) if msg.startswith('s/'): parts = msg.split('/') if (len(parts) >= 3 and handling(c, e) and parts[1] in last_lines[nick]): return c.privmsg(e.target, "{} meant: {}".format( nick, last_lines[nick].replace(parts[1], parts[2]))) else: last_lines[nick] = msg addressed = msg.startswith(my_nick) if addressed or handling(c, e) or 'http' in msg: if addressed: msg = msg[len(my_nick) + 2:] # remove addressing if msg.startswith('quit'): sys.exit() if msg.startswith('reload'): os.execl(sys.executable, sys.executable, __file__) handle(c, e, msg) luser.on_pubmsg = on_pubmsg
trimmed_line is here to hoist the trimmed line out of its
assignment block. Rust: the compiler knows better than you do.
        // Only PRIVMSGs are processed below.
        let channel;
        let trimmed_line;
        let mut line;
        if let Command::PRIVMSG(ref target, ref message) = msg.command {
            channel = target;
            line = message
        } else {
            continue 'messages;
        }
        // "report!": reply with the owners and a paste of the source code.
        if line == "report!" {
            freenode.send(Command::PRIVMSG(channel.clone(),
                              format!("operated by {} with source code {}",
                                      freenode.config()
                                          .owners
                                          .as_ref()
                                          .map(|v| v.join(", "))
                                          .unwrap_or("someone anonymous".into()),
                                      post_source_code())))
                .unwrap();
            continue 'messages;
        }
        // "s/old/new": echo the speaker's previous line with the substitution.
        if line.starts_with("s/") {
            let parts = line.split('/').collect::<Vec<_>>();
            if parts.len() < 3 {
                continue 'messages;
            }
            if let Some(old_line) = last_lines.get(&msg.source_nickname().map(String::from)) {
                // Shard by prefix length so only one luser replies.
                if lusers[msg.prefix.clone().unwrap().len() % lusers.len()] ==
                   freenode.current_nickname() {
                    freenode.send(Command::PRIVMSG(channel.clone(),
                                      format!("{} meant to say \"{}\"",
                                              msg.source_nickname().unwrap(),
                                              (old_line as &str)
                                                  .replace(parts[1], parts[2]))))
                        .unwrap();
                }
            }
        } else {
            last_lines.insert(msg.source_nickname().map(String::from), line.clone());
        }
        let addressed = line.starts_with(freenode.current_nickname());
        if addressed ||
           lusers[msg.prefix.unwrap().len() % lusers.len()] == freenode.current_nickname() {
            if addressed {
                // Strip the "luser: " addressing before matching handlers.
                trimmed_line = line[freenode.current_nickname().len() + 2..].into();
                line = &trimmed_line;
            }
            // First handler that matches and produces a non-empty reply wins.
            'handling: for h in &handlers {
                if h.can_handle(line) {
                    match h.run(line) {
                        Err(e) => println!("{:?} causes {:?}", line, e),
                        Ok(reply) => {
                            if !reply.is_empty() {
                                freenode.send(Command::PRIVMSG(channel.clone(), reply))
                                    .unwrap();
                                continue 'messages;
                            }
                        }
                    }
                }
            }
        }
    }
}
Rust handler scaffolding: casting into a common Error type and
associating regexes with their handling function. Rust is
surprisingly more object-happy than python.
/// Unified error type so every handler can return Result<String, Error>.
#[derive(Debug)]
enum Error {
    // Missing or unexpected data in an otherwise successful response.
    Data(String),
    Io(std::io::Error),
    Hyper(hyper::error::Error),
    Xml(quick_xml::error::Error),
    Json(rustc_serialize::json::ParserError),
}

/// A trigger regex paired with the function that handles matching lines.
struct Handler<'a>(Regex, &'a (Fn(&Regex, &str) -> Result<String, Error>));

impl<'a> Handler<'a> {
    /// True when this handler's regex matches the line.
    fn can_handle(&self, line: &str) -> bool {
        self.0.is_match(&line)
    }
    /// Run the handler, passing it its own regex for capture extraction.
    fn run(&self, line: &str) -> Result<String, Error> {
        self.1(&self.0, &line)
    }
}
Get title from URLs. The rust version only grabs the first URL in each message. Some domains with uninteresting titles are ignored:
"smbc-comics.com/", "libgen.io/", "xkcdb.com/"
"XKCDB: The: The #xkcd Quote Database", "Saturday Morning Breakfast Cereal", "Library Genesis"
const TITLE_REGEX: &'static str = r"https?:[^\s]+"; fn get_title(regex: &Regex, line: &str) -> Result<String, Error> { use hyper::header::{UserAgent, Cookie, CookiePair}; use scraper::{Html, Selector}; let url = regex.captures(&line).unwrap().expand("$0"); if ["smbc-comics.com/", "libgen.io/", "xkcdb.com/"].iter().any(|domain| url.contains(domain)) { return Ok(String::new()); } let mut response = try!(Client::new() .get(&url) .header(UserAgent("Firefox".into())) .header(Cookie(vec![CookiePair::new(// cookie to access NYtimes articles "NYT-S".into(), "0MOTYRtE4oUSHDXrmvxADeHEluv5kUWdpUdeFz9\ JchiAKuaKkdl/6loIV.Ynx4rkFI" .into())])) .send() .map_err(Error::Hyper)); let mut body = [0; 50_000]; response.read_exact(&mut body).ok(); if let Some(title_elem) = Html::parse_fragment(&String::from_utf8_lossy(&body)) .select(&Selector::parse("title").unwrap()) .next() { Ok(title_elem.first_child() .unwrap() .value() .as_text() .unwrap() .replace("\n", " ") .trim() .into()) } else { Err(Error::Data("Response has no title".into())) } }
def title(text):
    """ Retrieve titles from URL in text.
    >>> len(title('no url here'))
    0
    TODO This case should ignore the 404.
    >>> print(title('https://hdhoang.space/404 https://hdhoang.space/')) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 404: Not Found
    >>> print(title('https://hdhoang.space/luser.html https://hdhoang.space/luser.html'))
    IRC bot / IRC bot
    >>> print(title('http://www.nytimes.com/2016/01/26/business/marvin-minsky-pioneer-in-artificial-intelligence-dies-at-88.html'))
    Marvin Minsky, Pioneer in Artificial Intelligence, Dies at 88 - The New York Times
    >>> print(title('http://www.baomoi.com/bao-nhieu-tan-bot-trung-quoc-da-duoc-nhap-ve-lam-tra-o-long-tea-plus/c/18486151.epi'))
    Bao nhiêu tấn bột Trung Quốc đã được nhập về làm trà Ô long TEA Plus? - GĐ&XH;
    >>> print(title('http://news.zing.vn/chi-tiet-ban-do-cam-duong-dip-29-o-ha-noi-post574142.html'))
    Chi tiết bản đồ cấm đường dịp 2/9 ở Hà Nội - Thời sự - Zing.vn
    >>> print(title('https://www.facebook.com/photo.php?fbid=261863914155282&set=a.261860180822322.1073742015.100009950253866&type=3&theater'))
    Vo Thanh Thuy - Vo Thanh Thuy added 8 new photos to the... | Facebook
    >>> print(title('https://imgur.com/M18GYfw?r https://imgur.com/GUFyoUa?r'))
    Glorious new key cap set for my work keyboard! 
     - Imgur
    """
    # Titles filtered out because they add no information.
    uninteresting = ["XKCDB: The: The #xkcd Quote Database",
                     "Saturday Morning Breakfast Cereal",
                     "Library Genesis"]
    titles = []
    urls = filter(lambda w: w.startswith('http'), text.split())
    for u in urls:
        request = build_opener(HTTPCookieProcessor())
        # Some sites force gzip regardless; advertise it and a browser UA.
        request.addheaders = [('Accept-Encoding', 'gzip'),
                              ('User-Agent', 'Mozilla/5.0')]
        response = request.open(u)
        if response.info().get('Content-Encoding') == 'gzip':
            if sys.version_info.major == 3:
                response = GzipFile(fileobj=response)
            else:
                # py2 GzipFile needs a seekable file object.
                response = GzipFile(fileobj=StringIO(response.read()))
        # Only the first 50 kB: enough for <head> on any sane page.
        title = BeautifulSoup(response.read(50000), 'html.parser').title
        response.close()
        if (title and 'Imgur:' not in title.string
                and title.string not in uninteresting):
            titles.append(title.string.replace('\n', '').strip())
    return ' / '.join(titles)
Ask Wolfram|Alpha, the knowledge engine.
const WA_REGEX: &'static str = concat!(r"^(\.|!|:)", "wa (?P.+)" ); fn wolframalpha(regex: &Regex, line: &str) -> Result<String, Error> { use hyper::header::ContentLength; use quick_xml::{XmlReader, Event}; let mut response = try!(Client::new() .get(®ex.captures(&line) .unwrap() .expand("http://api.wolframalpha.\ com/v2/query?format=plaintext&appid=\ 3JEW42-4XXE264A93&input=$query")) .send() .map_err(Error::Hyper)); let mut xml = String::with_capacity(**response.headers.get::<ContentLength>().unwrap() as usize); try!(response.read_to_string(&mut xml).map_err(Error::Io)); let tree = XmlReader::from_str(&xml).trim_text(true); let mut answers = vec![]; for event in tree { match event { Ok(Event::Start(ref elem)) if elem.name() == b"pod" => { answers.push(String::from_utf8(try!(elem.attributes() .next() .unwrap() .map_err(Error::Xml)) .1 .into()) .unwrap() + ": ") } Ok(Event::Text(elem)) =>{ answers.push(try!(elem.into_string().map_err(Error::Xml)) + " /") } _ => (), } } Ok(answers.join(" ")) }
def wolframalpha(text):
    """ Query WolframAlpha about text.
    >>> print(wolframalpha('mass of sol'))
    Input interpretation: Sun | mass / Result: 1.988435×10^30 kg (kilograms) / Unit conversions: 4.383749×10^30 lb (pounds) / 2.191874×10^27 sh tn (short tons) / 1.988435×10^33 grams / 1 M_☉ (solar ma http://wolframalpha.com/?input=mass%20of%20sol
    Check URL encoding:
    >>> print(wolframalpha('4+6'))
    Input: 4+6 / Result: 10 / Number name: ten / Number line: Manipulatives illustration: | + | | | 4 | | 6 | | 10 / Typical human computation times: age 6: 5.3 seconds | age 8: 2.6 seconds | age 10: 1.7 seconds | age 18: 0.93 seconds (ignoring concentration, repetition, variations in education, etc.) /
    >>> print(wolframalpha('é'))
    Input interpretation: é (character) / Visual form: Name: Latin small letter e with acute / Positions in alphabets: Czech | 9th letter (33rd letter from the end) Slovak | 12th letter (35th letter from http://wolframalpha.com/?input=%C3%A9
    """
    r = urlopen(
        'http://api.wolframalpha.com/v2/query?format=plaintext&appid=3JEW42-4XXE264A93&input='
        + quote(text))
    tree = ET.parse(r)
    reply = ''
    for n in tree.iter():
        # Pod titles become section labels; plaintext nodes carry the answers.
        if n.tag == 'pod':
            reply += n.attrib['title'] + ': '
        if n.tag == 'plaintext' and n.text and len(n.text.strip()):
            reply += n.text + ' / '
    # Overlong replies are truncated with a link to the full result.
    # NOTE(review): placed after the loop — tangling lost indentation;
    # confirm against the doctests above.
    if len(reply) > 512:
        reply = reply[:200] + " http://wolframalpha.com/?input=" + quote(text)
    r.close()
    return reply.replace('\n', ' ')
Returns the first Google result.
const GOOGLE_REGEX: &'static str = concat!(r"^(\.|!|:)", "g (?P.+)" ); fn google(regex: &Regex, line: &str) -> Result<String, Error> { use rustc_serialize::json::Json; // API: https://developers.google.com/web-search/docs/#code-snippets let mut response = try!(Client::new() .get(®ex.captures(&line) .unwrap() .expand("https://ajax.googleapis.\ com/ajax/services/search/web?v=1.\ 0&rsz=1&q=$query")) .send() .map_err(Error::Hyper)); let json = try!(Json::from_reader(&mut response).map_err(Error::Json)); let results = try!(json.search("results").ok_or(Error::Data("No results".into()))); if results.as_array().unwrap().is_empty() { return Ok("No results".into()); } let url = try!(results[0] .find("unescapedUrl") .ok_or(Error::Data("No url".into())) .map(|j| j.as_string().unwrap())); let title = try!(results[0] .find("titleNoFormatting") .ok_or(Error::Data("No title".into())) .map(|j| j.as_string().unwrap())); Ok(format!("{} {}", title, url)) }
def google(text):
    """ Retrieve the first result from a google for text.
    >>> print(google('á'))
    Á - Wikipedia, the free encyclopedia https://en.wikipedia.org/wiki/%C3%81
    >>> print(google('trump south-china sea'))
    Donald Trump weighs in on China's island-building in the South ... http://www.politifact.com/truth-o-meter/statements/2016/apr/04/donald-trump/donald-trump-weighs-chinas-island-building-south-c/
    >>> print(google('naesuth no result here'))
    0 result
    """
    # Single-result query against the (legacy) ajax web-search endpoint.
    resp = urlopen(
        'https://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=1&q='
        + quote(text))
    payload = json.loads(resp.read().decode())['responseData']
    resp.close()
    results = payload['results']
    if not results:
        return '0 result'
    first = results[0]
    # Unescape HTML entities in the title before pairing it with the URL.
    return html.unescape(first['titleNoFormatting']) + ' ' + first['unescapedUrl']
Translates using Yandex:
const TRANSLATE_REGEX: &'static str = concat!(r"^(\.|!|:)", "tr (?P[^ ]+) (?P ); fn translate(regex: &Regex, line: &str) -> Result<String, Error> { use rustc_serialize::json::Json; let mut response = try!(Client::new() .get(®ex.captures(&line) .unwrap() .expand("https://translate.yandex.\ net/api/v1.5/tr.\ json/translate?key=trnsl.1.1.20160210T093900Z.c6eacf09bbb65cfb.cc28de2ba798bc3bc118e9f8201b6e6cea697810&text=$text&\ lang=$lang")) .send() .map_err(Error::Hyper)); let json = try!(Json::from_reader(&mut response).map_err(Error::Json)); let reply = match json.find("code").unwrap().as_u64().unwrap() { 200 => { format!("{}: {}", json.find("lang").unwrap().as_string().unwrap(), json.find("text").unwrap()[0].as_string().unwrap()) } 501 => json.find("message").unwrap().as_string().unwrap().into(), _ => format!("{:?}", json.as_string()), }; Ok(reply) }.+)"
def translate(direction, text):
    """ Translate text according to direction.
    >>> print(translate('la-en', 'ad astra per aspera'))
    la-en: to the stars through rough
    >>> print(translate('vi', "you think you're good?"))
    en-vi: ngươi nghĩ ngươi giỏi không?
    >>> print(translate('en', 'mày nghĩ mày ngon?'))
    vi-en: you think you're so tough?
    >>> print(translate('jbo', 'hello')) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
    """
    # Guard: the API rejects empty text anyway.
    if not text:
        return 'Missing text'
    endpoint = (
        'https://translate.yandex.net/api/v1.5/tr.json/translate?key=trnsl.1.1.20160210T093900Z.c6eacf09bbb65cfb.cc28de2ba798bc3bc118e9f8201b6e6cea697810&text={}&lang={}'
        .format(quote(text), direction))
    resp = urlopen(endpoint)
    payload = json.loads(resp.read().decode())
    resp.close()
    # The API echoes the resolved direction in 'lang'.
    return payload['lang'] + ": " + payload['text'][0]
Posts its own source code:
/// Paste main.rs and Cargo.toml to http://ix.io; returns the paste URL(s)
/// on one line, or an error description.
fn post_source_code() -> String {
    use url::form_urlencoded;
    // ix.io form fields: name:N / f:N carry each file; NOTE(review): the
    // "read:N"="3" field follows ix.io's form convention — confirm semantics
    // against the service's docs.
    let form = [("read:1", "3"),
                ("name:1", "main.rs"),
                ("f:1", include_str!("main.rs")),
                ("read:2", "3"),
                ("name:2", "Cargo.toml"),
                ("f:2", include_str!("../Cargo.toml"))];
    let result = Client::new()
        .post("http://ix.io")
        .body(&form_urlencoded::serialize(form.iter()))
        .send();
    match result {
        Ok(mut response) => {
            let mut reply = String::new();
            let _ = response.read_to_string(&mut reply);
            // Collapse the newline-separated URLs into one IRC-safe line.
            reply.replace('\n', " ")
        }
        Err(e) => format!("unable to post: {:?}", e),
    }
}
def report():
    """ Return owner and source code
    >>> print(report()) # doctest: +ELLIPSIS
    operated by ... with source code http://ix.io/...
    """
    # Paste this file to ix.io and cite $USER as the operator.
    conn = HTTPConnection('ix.io')
    conn.request(
        'POST', '/',
        'read:1=3&name:1=luser.py&f:1=' + quote(open(__file__).read()))
    return "operated by {} with source code {}".format(
        os.getenv('USER'), conn.getresponse().read().decode().strip())
# Run the bot when executed as a script.
if __name__ == '__main__':
    main()