This commit is contained in:
lnkr 2022-04-18 03:09:13 +03:00
parent a2f363cbca
commit 592bcbbd4b
6 changed files with 1181 additions and 0 deletions

1
bannedusernames.txt Normal file
View file

@ -0,0 +1 @@
Бэдыч

0
bannedwords.txt Normal file
View file

10
helpers.py Normal file
View file

@ -0,0 +1,10 @@
class RID(object):
    """Monotonically increasing integer counter exposed through ``x``.

    Reading ``x`` advances the counter and returns the new value, so
    successive reads yield 1, 2, 3, ...  Assigning to ``x`` re-seeds the
    counter; the next read then returns ``value + 1``.

    The increment-and-read in the getter is guarded by a lock so that
    concurrent readers can never be handed the same number.
    """

    def __init__(self):
        # Imported locally: this module has no import section of its own.
        import threading

        self._id = 0
        self._lock = threading.Lock()

    @property
    def x(self):
        """Advance the counter by one and return the new value."""
        with self._lock:
            self._id += 1
            return self._id

    @x.setter
    def x(self, value):
        """Re-seed the counter; the next read of ``x`` returns ``value + 1``."""
        with self._lock:
            self._id = value

592
modbot.py Normal file
View file

@ -0,0 +1,592 @@
# Tested on Python 3.10
# Reqs:
# python -m pip install requests
from datetime import datetime, timedelta
from threading import Thread
from time import sleep
import requests
import re
import base64
import uuid
import random
import settings
import xmltodict
import queue
import os
from helpers import RID
class ModBot:
def __init__(self, instance, username, password, chatroom) -> None:
    """Store the connection settings and set up the bot's in-memory state.

    Nothing is sent over the network here.

    Args:
        instance: Host name of the PeerTube instance (no scheme).
        username: PeerTube account name used to log in.
        password: Password of that account.
        chatroom: Identifier of the livechat room to join.
    """
    # --- Connection settings ---
    self.peertube_instance = instance
    self.peertube_username = username
    self.peertube_password = password
    self.chat_room = chatroom
    self._prefixed_instance = "https://{}".format(instance)
    self.resource_pref = "modbot.py"
    self.bot_msg_prefix = "/me [ModBot]: "

    # --- Counters: BOSH request ids and bot-local user ids ---
    self.rid = RID()
    self.uid = RID()

    # --- Room state ---
    self.users = dict()
    self.msg_history = list()
    self.msg_history_max_len = 500
    self.restricted_mode = 0
    self.last_msg = dict(username="", count=0)
    self.msg_queue = queue.Queue()

    # --- Moderation tunables ---
    # Timeout length used when !timeout is sent without a 2nd argument.
    self.timeout_default = 300
    # Time window (in seconds) over which message rates are measured.
    self.ratelimit_timewindow = 10
    # How many messages one user may send within 'ratelimit_timewindow'.
    self.ratelimit_maxmessages = 3
    # Timeout (in seconds) applied to a user who exceeds the limits.
    self.ratelimit_timeout = 60

    # --- Ban lists, reloaded from disk when the file's mtime changes ---
    self.banned_usernames = list()
    self._bannedusernames_filename = "bannedusernames.txt"
    self._bannedusernames_stamp = 0
    self.banned_words = list()
    self._bannedwords_filename = "bannedwords.txt"
    self._bannedwords_stamp = 0
def _bosh_send(self, str):
    """POST one raw BOSH payload and return the response body as text.

    Args:
        str: The complete XML ``<body .../>`` payload to send.

    Returns:
        The text of the HTTP response.
    """
    endpoint = f'{self._prefixed_instance}{self.bosh_service_url}'
    payload = str.encode('utf-8')
    response = requests.post(
        endpoint,
        headers={'Content-Type': 'text/xml; charset=utf-8'},
        timeout=60,
        data=payload,
    )
    # Debugging aid:
    # print(f"REQUEST: {str}\nRESPONSE: {response.text}\n")
    return response.text
def _generate_body_headers(self):
    """Return the attribute string for the next BOSH ``<body>`` element.

    Every call consumes a fresh request id from ``self.rid``.
    """
    request_id = self.rid.x
    session_id = self.authid
    return 'rid="{}" sid="{}" xmlns="http://jabber.org/protocol/httpbind"'.format(
        request_id, session_id)
def update_user(self, presenses):
# Update self.users from one presence stanza (a dict) or a list of them, as
# parsed by xmltodict, and mute users that fall under the active restrictions.
# NOTE(review): this file's indentation was lost, so the nesting described in
# the comments below is the most plausible reading -- confirm it against the
# original source before relying on it.
# Accept a single stanza as well as a list of stanzas.
if isinstance(presenses, dict):
presenses = [presenses, ]
for presense in presenses:
# The nickname is whatever follows the first "/" of the sender address.
# NOTE(review): raises IndexError when "@from" contains no "/".
username = presense["@from"].split("/", 1)[1]
# If you are getting error HERE, your account... got banned from your room???
x_items = presense["x"]["item"]
# A single <item> arrives as a dict; normalise to a list.
if isinstance(x_items, dict):
x_items = [x_items, ]
for x_item in x_items:
# Role, affiliation and bare JID (resource part stripped) of the occupant.
userdata = {
"role": x_item["@role"],
"affiliation": x_item["@affiliation"],
"jid": x_item["@jid"].split("/", 1)[0],
}
new_user = False
# An item carrying "@nick" is handled as a nickname change: the entry whose
# JID matches is moved to the new nickname, keeping its uid and other state.
if "@nick" in x_item:
for prev_username, prev_user in self.users.items():
if prev_user["jid"] == userdata["jid"]:
newnick = x_item['@nick']
# self.users is modified while it is being iterated; the "break" a few
# lines down leaves the loop before the iterator is advanced again.
self.users[newnick] = self.users[prev_username]
del self.users[prev_username]
user = self.users[newnick]
print(f"[{datetime.now().strftime('%H:%M:%S')}] ({user['uid']}|{prev_username}|{user['jid']} changed nickname to {newnick}", end="", flush=True)
# The stored role is "visitor" (what bosh_user_mute assigns), so the mute is
# re-applied to the new nickname.
if self.users[newnick]["role"] == "visitor":
print(f" and was muted too)")
self.bosh_user_mute(newnick)
else: print(")")
break
# NOTE(review): presumably pairs with 'if "@nick" in x_item' (ordinary presence
# without a nickname change) rather than with the "for" above -- confirm.
else:
if username in self.users:
# Known user: refresh role, affiliation and JID in place.
self.users[username]["role"] = userdata["role"]
self.users[username]["affiliation"] = userdata["affiliation"]
self.users[username]["jid"] = userdata["jid"]
else:
# First time this nickname is seen: store it and assign the next bot-local id.
self.users[username] = userdata
self.users[username]["uid"] = self.uid.x
new_user = True
# The mute rules below are skipped for users whose role is "visitor" or "none".
if not userdata["role"] in ("visitor", "none"):
# NOTE(review): which of the following checks are nested under "if new_user"
# cannot be told from the flattened source -- confirm.
if new_user:
# Modes 1 and 2 mute users whose JID contains "@anon."; mode 3 mutes everyone
# whose role is not "moderator".
if self.restricted_mode == 1 and "@anon." in userdata['jid']:
self.bosh_user_mute(username)
if self.restricted_mode == 2 and "@anon." in userdata['jid']:
self.bosh_user_mute(username)
elif self.restricted_mode == 3 and userdata["role"] != "moderator":
self.bosh_user_mute(username)
# Substring match against every banned username.
# NOTE(review): an empty entry in self.banned_usernames matches every nickname,
# because "" is a substring of any string.
for banned_username in self.banned_usernames:
if banned_username in username:
print(f"User '{username}' was muted: contains banned username '{banned_username}'")
self.bosh_user_mute(username)
def send_msg(self, msg, cmd=False):
    """Deliver a bot message to the chat room or, for console use, to stdout.

    Args:
        msg: Plain text to deliver.
        cmd: When true, the message answers a command typed on the local
            console and is printed instead of being sent to the room.

    Returns:
        The BOSH response text when the message was sent to the room,
        otherwise None.
    """
    if cmd:
        print(msg)
        return None

    from xml.sax.saxutils import escape

    # Messages often embed user-chosen nicknames; escape the text so that
    # characters such as "<" or "&" cannot corrupt the stanza.
    text = escape(f"{self.bot_msg_prefix}{msg}")
    someid = uuid.uuid4()
    req_body = f'<body {self._generate_body_headers()}><message from="{self._internal_user_addr}" id="{someid}" to="{self._internal_room_addr}" type="groupchat" xmlns="jabber:client"><body>{text}</body><active xmlns="http://jabber.org/protocol/chatstates"/><origin-id id="{someid}" xmlns="urn:xmpp:sid:0"/></message></body>'
    return self._bosh_send(req_body)
def connect(self):
    """Log in to PeerTube, open a BOSH session for the chat room and join it.

    Steps, in order: fetch the instance's OAuth client credentials, obtain a
    user access token, read the livechat version and the BOSH/authentication
    URLs from the webchat page, fetch the chat credentials, run the BOSH
    handshake (session request, SASL PLAIN auth, stream restart, resource
    bind, session), then send the join presence and an "Online!" message.

    Sets ``livechat_version``, ``bosh_service_url``, ``authentication_url``,
    ``creds``, ``resource_id``, ``authid``, ``_internal_user_addr`` and
    ``_internal_room_addr`` on the instance.

    Raises:
        SystemExit: With status 1 when the PeerTube login or the webchat
            authentication is rejected.
        AttributeError: When the webchat page lacks one of the expected
            values (for example because the room does not exist).
    """
    _pi = self._prefixed_instance
    _room = self.chat_room
    _inst = self.peertube_instance

    print(f'Peertube: Getting client tokens...')
    client_tokens = requests.get(f'{_pi}/api/v1/oauth-clients/local').json()

    print(f'Peertube: Authorizing and getting user token...')
    token_resp = requests.post(f'{_pi}/api/v1/users/token', data={
        "client_id": client_tokens["client_id"],
        "client_secret": client_tokens["client_secret"],
        "grant_type": "password",
        "response_type": "code",
        "username": self.peertube_username,
        "password": self.peertube_password,
    }).json()
    if "access_token" not in token_resp:
        print(f"Error while logging in: {token_resp}")
        raise SystemExit(1)
    access_token = token_resp["access_token"]

    print(f"Webchat: Accessing room {_room} at instance {_inst}")
    req = requests.get(f'{_pi}/plugins/livechat/router/webchat/room/{_room}')
    # Getting errors there? Does the room even exist?
    self.livechat_version = re.search(
        r"/livechat/(.*)/static/", req.text, re.MULTILINE).group(1)
    print(f'Webchat: Livechat version: {self.livechat_version}')
    self.bosh_service_url = re.search(
        r"boshServiceUrl: '(.*)',", req.text, re.MULTILINE).group(1)
    self.authentication_url = re.search(
        r"authenticationUrl: '(.*)',", req.text, re.MULTILINE).group(1).removeprefix(_pi)

    req = requests.get(f'{_pi}{self.authentication_url}',
                       headers={'authorization': f'Bearer {access_token}'})
    if req.status_code != 200:
        print(f"Webchat: ERROR: UNAUTHORIZED (STATUS:{req.status_code})")
        raise SystemExit(1)
    creds = req.json()
    print(f"Webchat: Signed in as '{creds['nickname']}' ({creds['jid']})")
    self.creds = creds

    # SASL PLAIN carries the credentials as UTF-8; encoding them as ASCII
    # raised UnicodeEncodeError for any non-ASCII nickname or password.
    authb64 = base64.b64encode(
        f'\0{creds["nickname"]}\0{creds["password"]}'.encode('utf-8')).decode()
    resource_id = f'{self.resource_pref}-{random.randint(10000000, 99999999)}'
    self.resource_id = resource_id

    print(f'boshService: Fetching "authid"...')
    req_body = f'<body content="text/xml; charset=utf-8" hold="1" rid="{self.rid.x}" to="{_inst}" ver="1.6" wait="59" xml:lang="en" xmlns="http://jabber.org/protocol/httpbind" xmlns:xmpp="urn:xmpp:xbosh" xmpp:version="1.0"/>'
    self.authid = xmltodict.parse(self._bosh_send(req_body))["body"]["@authid"]
    self._internal_user_addr = f'{creds["nickname"]}@{_inst}/{resource_id}'
    self._internal_room_addr = f'{_room}@room.{_inst}'

    print(f'boshService: Authorizing...')
    req_body = f'<body {self._generate_body_headers()}><auth mechanism="PLAIN" xmlns="urn:ietf:params:xml:ns:xmpp-sasl">{authb64}</auth></body>'
    self._bosh_send(req_body)

    print(f'boshService: Restarting datastream...')
    req_body = f'<body {self._generate_body_headers()} to="{_inst}" xml:lang="en" xmlns:xmpp="urn:xmpp:xbosh" xmpp:restart="true"/>'
    self._bosh_send(req_body)

    print(f'boshService: Binding auth...')
    req_body = f'<body {self._generate_body_headers()}><iq id="_bind_auth_2" type="set" xmlns="jabber:client"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>{resource_id}</resource></bind></iq></body>'
    self._bosh_send(req_body)

    print(f'boshService: Getting session...')
    req_body = f'<body {self._generate_body_headers()}><iq id="_session_auth_2" type="set" xmlns="jabber:client"><session xmlns="urn:ietf:params:xml:ns:xmpp-session"/></iq></body>'
    self._bosh_send(req_body)

    print(f'boshService: Joining room, sending presence...')
    self.bosh_update_presenses()
    print(f'Done, check if there is ModBot message in chat.')
    self.send_msg("Online!")
def bosh_update_presenses(self):
    """Send the bot's join presence and record the occupants reported back.

    The presence is addressed to the room under the bot's own nickname and
    asks for no message history.  Every ``<presence>`` stanza found in the
    response is passed to ``update_user``; a response without any presence
    stanza is ignored.
    """
    req_body = f'<body {self._generate_body_headers()}><presence from="{self._internal_user_addr}" to="{self._internal_room_addr}/{self.creds["nickname"]}" xmlns="jabber:client"><x xmlns="http://jabber.org/protocol/muc"><history maxstanzas="0"/></x><c hash="sha-1" node="https://conversejs.org" ver="vFjUiQWh2ew0hsRBxf7LNFK8ol0=" xmlns="http://jabber.org/protocol/caps"/></presence></body>'
    response = self._bosh_send(req_body)
    body = xmltodict.parse(response)["body"]
    if not body or "presence" not in body:
        # Nothing to record; the <body> element itself must never be handed
        # to update_user as if it were a presence stanza.
        return
    presences = body["presence"]
    # A single stanza is parsed as a dict, several as a list.
    if isinstance(presences, dict):
        presences = [presences]
    for presence in presences:
        self.update_user(presence)
def bosh_user_unmute(self, username):
    """Unmute a room occupant by giving them the MUC role ``participant``.

    Args:
        username: Room nickname of the occupant.

    Returns:
        The BOSH response text.
    """
    from xml.sax.saxutils import escape

    # Nicknames are user-chosen; escape them so that characters such as
    # & < " cannot break the stanza.
    nick = escape(str(username), {'"': "&quot;"})
    req_body = f'<body {self._generate_body_headers()}><iq id="{uuid.uuid4()}:sendIQ" to="{self._internal_room_addr}" type="set" xmlns="jabber:client"><query xmlns="http://jabber.org/protocol/muc#admin"><item nick="{nick}" role="participant"><reason/></item></query></iq></body>'
    return self._bosh_send(req_body)
def bosh_user_mute(self, username):
    """Mute a room occupant by giving them the MUC role ``visitor``.

    Args:
        username: Room nickname of the occupant.

    Returns:
        The BOSH response text.
    """
    from xml.sax.saxutils import escape

    # Nicknames are user-chosen; without escaping, a nickname containing
    # & < or " would yield a malformed stanza and the mute would be lost.
    nick = escape(str(username), {'"': "&quot;"})
    req_body = f'<body {self._generate_body_headers()}><iq id="{uuid.uuid4()}:sendIQ" to="{self._internal_room_addr}" type="set" xmlns="jabber:client"><query xmlns="http://jabber.org/protocol/muc#admin"><item nick="{nick}" role="visitor"><reason/></item></query></iq></body>'
    return self._bosh_send(req_body)
def bosh_user_ban(self, jid):
    """Ban a user from the room by setting their affiliation to ``outcast``.

    Args:
        jid: JID of the user to ban.

    Returns:
        The BOSH response text.
    """
    from xml.sax.saxutils import escape

    # Escape the value so it cannot break out of the XML attribute.
    safe_jid = escape(str(jid), {'"': "&quot;"})
    req_body = f'<body {self._generate_body_headers()}><iq id="{uuid.uuid4()}:sendIQ" to="{self._internal_room_addr}" type="set" xmlns="jabber:client"><query xmlns="http://jabber.org/protocol/muc#admin"><item affiliation="outcast" jid="{safe_jid}"><reason/></item></query></iq></body>'
    return self._bosh_send(req_body)
def bosh_retract_msg(self, msg_id):
    """Ask the room to retract (moderate away) one message.

    Args:
        msg_id: Id of the message to retract, as stored in
            ``msg_history`` under ``"msg_id"``.

    Returns:
        The BOSH response text.
    """
    from xml.sax.saxutils import escape

    # Escape the id so it cannot break out of the XML attribute.
    safe_id = escape(str(msg_id), {'"': "&quot;"})
    req_body = f'<body {self._generate_body_headers()}><iq id="{uuid.uuid4()}:sendIQ" to="{self._internal_room_addr}" type="set" xmlns="jabber:client"><apply-to id="{safe_id}" xmlns="urn:xmpp:fasten:0"><moderate xmlns="urn:xmpp:message-moderate:0"><retract xmlns="urn:xmpp:message-retract:0"/><reason></reason></moderate></apply-to></iq></body>'
    return self._bosh_send(req_body)
def wipe_user_msg(self, user_id):
    """Retract every remembered message of one user and forget those messages.

    Args:
        user_id: Bot-local numeric id of the user whose messages are wiped.
    """
    kept = []
    for entry in self.msg_history:
        if entry["user_id"] != user_id:
            kept.append(entry)
            continue
        self.bosh_retract_msg(entry["msg_id"])
    # Replace the history only after every retraction has been issued.
    self.msg_history = kept
def process_command(self, body, cmd=False):
    """Interpret one ``!command`` string and carry it out.

    Every reply goes through ``send_msg`` with the given ``cmd`` flag.
    Text that matches no command is silently ignored.

    Args:
        body: Command text, for example ``"!mute 3"``.
        cmd: Passed through to ``send_msg`` with every reply.
    """
    if not isinstance(body, str):
        # Only plain text can hold a command.
        return

    def reply(text):
        self.send_msg(text, cmd)

    def find_user(uid):
        """Return ``(username, user)`` for the given numeric id, or None."""
        for username, user in self.users.items():
            if user["uid"] == uid:
                return username, user
        return None

    def for_user(raw_id, action):
        """Validate raw_id, look the user up, run ``action(uid, username, user)``."""
        # Every string accepted by isdecimal() can be converted by int();
        # isdigit() also accepts characters such as '²', for which int()
        # raises ValueError.
        if not raw_id.isdecimal():
            reply(f"Error: '{raw_id}' is not integer. Provide numeric user ID (you can list users with !users)")
            return
        uid = int(raw_id)
        match = find_user(uid)
        if match is None:
            reply(f"Can't find user with ID={uid}")
            return
        action(uid, *match)

    def user_listing(title, include_offline):
        """Build the text of !users / !usersall."""
        lines = [title, "ID) [name] (jid, role, affiliation)"]
        lines += [
            f"{user['uid']}) [{username}] ({user['jid']}, {user['role']}, {user['affiliation']})"
            for username, user in self.users.items()
            # Users whose role is "none" are listed only by !usersall.
            if include_offline or user["role"] != "none"
        ]
        return "\n".join(lines) + "\n"

    if body == "!help":
        reply(
            "List of commands:\n"
            "!users - Show list of online users\n"
            "!usersall - Show list of all known users\n"
            "!mute USERID - Mute user by ID\n"
            "!unmute USERID - Unmute user by ID\n"
            "!timeout USERID SEC - Mute user by ID for SEC seconds\n"
            "!wipe USERID - Retract user's recent messages\n"
            "!ban USERID - Ban user by ID and retract his last messages\n"
            "!mode 0 - Lift mutes, everyone can talk\n"
            "!mode 1 - Mute anons-newcomers, existing and registered users can talk\n"
            "!mode 2 - Mute all anons, registered users can talk\n"
            "!mode 3 (or !shutup) - Siege mode: mute everyone except moderators.\n"
            "!ratelimits M W S - Rewrite ratelimits: allow M messages per W seconds before S seconds timeout.\n"
        )
    elif body == "!users":
        reply(user_listing("List of online users:", include_offline=False))
    elif body == "!usersall":
        reply(user_listing("List of known users:", include_offline=True))
    elif body.startswith("!mute"):
        def mute(uid, username, user):
            self.bosh_user_mute(username)
            reply(f"'{username}' muted")
        for_user(body[6:], mute)
    elif body.startswith("!unmute"):
        def unmute(uid, username, user):
            self.bosh_user_unmute(username)
            reply(f"'{username}' unmuted")
        for_user(body[8:], unmute)
    elif body.startswith("!timeout"):
        args = body[9:].split(" ")
        raw_id = args[0]
        # The duration is optional and falls back to the configured default.
        raw_seconds = args[1] if len(args) > 1 else str(self.timeout_default)
        if not (raw_id.isdecimal() and raw_seconds.isdecimal()):
            reply("Error: malformed command")
            return
        uid, seconds = int(raw_id), int(raw_seconds)
        match = find_user(uid)
        if match is None:
            reply(f"Can't find user with ID={uid}")
            return
        username, user = match
        self.bosh_user_mute(username)
        user["timeout_until"] = datetime.now() + timedelta(seconds=seconds)
        reply(f"'{username}' muted for {seconds} seconds")
    elif body.startswith("!ratelimits"):
        args = body[12:].split(" ")
        if len(args) == 3 and all(arg.isdecimal() for arg in args):
            msgs, window, seconds = (int(arg) for arg in args)
            self.ratelimit_maxmessages = msgs
            self.ratelimit_timewindow = window
            self.ratelimit_timeout = seconds
            reply(f"Ratelimits updated: users who send >={msgs} messages in {window} seconds will be timed out for {seconds} seconds.")
        else:
            reply("Error: malformed command")
    elif body == "!mode 0":
        for username in self.users:
            self.bosh_user_unmute(username)
        self.restricted_mode = 0
        reply("Code 🟩-0, everyone unmuted. Welcome back!")
    elif body == "!mode 1":
        self.restricted_mode = 1
        reply("Code 🟨-0, newcomers will be muted.")
    elif body == "!mode 2":
        for username, user in self.users.items():
            if "@anon." in user['jid']:
                self.bosh_user_mute(username)
        self.restricted_mode = 2
        reply("Code 🟨-1, anon users muted.")
    elif body == "!mode 3" or body == "!shutup":
        for username, user in self.users.items():
            if user["role"] != "moderator":
                self.bosh_user_mute(username)
        self.restricted_mode = 3
        reply("Code 🟥-0, everyone muted. Stand by.")
    elif body.startswith("!wipe"):
        def wipe(uid, username, user):
            self.wipe_user_msg(uid)
            reply(f"'{username}' msgs wiped.")
        for_user(body[6:], wipe)
    elif body.startswith("!ban"):
        def ban(uid, username, user):
            if user["jid"] == self.creds['jid']:
                reply(f"Nope, won't ban myself ({self.creds['jid']})")
                return
            self.bosh_user_ban(user["jid"])
            reply(f"'{username}' banned.")
            self.wipe_user_msg(uid)
        for_user(body[5:], ban)
def _receiver_loop(self):
    """Poll the BOSH endpoint forever and queue every raw response.

    Each iteration posts an empty ``<body/>`` request and puts the response
    text on ``self.msg_queue``.  This method never returns.
    """
    while True:
        poll = f'<body {self._generate_body_headers()}/>'
        self.msg_queue.put(self._bosh_send(poll))
def start_receiver_loop(self):
    """Start ``_receiver_loop`` in a background daemon thread."""
    Thread(target=self._receiver_loop, daemon=True).start()
def _reader_loop(self):
# Consume raw BOSH responses from self.msg_queue forever. For every response:
# reload the ban lists if their files changed, lift expired timeouts, apply
# presence updates, then log each chat message and either run it as a command
# (moderators) or check it against rate limits and banned words (others).
# NOTE(review): this file's indentation was lost; the nesting implied by the
# comments below is the most plausible reading -- confirm against the original.
while True:
# Blocks until a response is available.
resp = self.msg_queue.get()
#self.bosh_update_presenses()
msg = xmltodict.parse(resp)["body"]
#print(self.users)
#print(json.dumps(msg, indent=2) + "\n")
time_now = datetime.now()
# Reload bannedusernames.txt whenever its modification time differs from the
# one remembered at the last load.
stamp = os.stat(self._bannedusernames_filename).st_mtime
if self._bannedusernames_stamp != stamp:
self._bannedusernames_stamp = stamp
with open(self._bannedusernames_filename, 'r', encoding='utf-8') as file:
# NOTE(review): blank lines are kept as "" entries, and "" is a substring of
# every string, so one blank line in the file makes every check match.
self.banned_usernames = [line.strip() for line in file]
print(f"Banned usernames list updated (count: {len(self.banned_usernames)}).")
# Same reload logic for bannedwords.txt (the same blank-line caveat applies).
stamp = os.stat(self._bannedwords_filename).st_mtime
if self._bannedwords_stamp != stamp:
self._bannedwords_stamp = stamp
with open(self._bannedwords_filename, 'r', encoding='utf-8') as file:
self.banned_words = [line.strip() for line in file]
print(f"Banned words list updated (count: {len(self.banned_words)}).")
# Unmute every user whose timeout has expired and drop the marker. Only the
# inner per-user dict is modified, so iterating self.users stays valid.
for username, user in self.users.items():
if "timeout_until" in user:
if user["timeout_until"] < time_now:
self.send_msg(f"'{username}' unmuted after timeout.")
self.bosh_user_unmute(username)
del self.users[username]["timeout_until"]
# should probably be removed since there is a bosh_update_presenses() call
if "presence" in msg:
self.update_user(msg["presence"])
if "message" in msg:
chatmsgs = msg["message"]
# A single <message> is parsed as a dict; normalise to a list.
if isinstance(chatmsgs, dict):
chatmsgs = [chatmsgs, ]
for chatmsg in chatmsgs:
if "body" in chatmsg:
# If it's actually a message in the chat
body = chatmsg["body"]
# The nickname is whatever follows the first "/" of the sender address.
username = chatmsg["@from"].split("/", 1)[1]
# NOTE(review): raises KeyError when the sender is not in self.users.
msguser = self.users[username]
# Remember the message so that !wipe / !ban can retract it later.
self.msg_history.append({
"user_id": msguser['uid'],
"body": body,
"msg_id": chatmsg["stanza-id"]["@id"]
})
# Keep at most msg_history_max_len entries, discarding the oldest.
if (len(self.msg_history) > self.msg_history_max_len):
self.msg_history.pop(0)
# Count consecutive messages from the same user (shown as "xN" in the log line).
if self.last_msg["username"] == username:
self.last_msg["count"] += 1
else:
self.last_msg["username"] = username
self.last_msg["count"] = 1
# Log at most the first 50 characters of the message.
msgtoprint = (body[:50] + '..') if len(body) > 50 else body
print(f"[{datetime.now().strftime('%H:%M:%S')}] ({msguser['uid']}|{username}|{msguser['jid']}|x{self.last_msg['count']}) {msgtoprint}")
# Moderators' messages are interpreted as commands; everyone else is subject
# to the checks below.
if self.users[username]["role"] == "moderator":
self.process_command(body)
else:
# Start of ratelimit maneuvers
user = self.users[username]
if not "msgtimes" in user:
self.users[username]["msgtimes"] = []
rl_tw = timedelta(seconds=self.ratelimit_timewindow)
# NOTE(review): this removes entries from the very list being iterated, which
# can skip elements; the counting loop below re-checks each timestamp against
# the window, so stale entries left behind are not counted.
for timestamp in user["msgtimes"]:
if timestamp < time_now - rl_tw:
self.users[username]["msgtimes"].remove(timestamp)
self.users[username]["msgtimes"].append(time_now)
# A message whose hash equals the hash of the user's previous message is
# treated as a duplicate.
msg_duplicate = False
if "lastmsg" in self.users[username]:
if self.users[username]["lastmsg"] == hash(body):
msg_duplicate = True
msgcount = 0
# Timestamps inside the window: a duplicate causes an immediate timeout,
# otherwise the messages are counted against ratelimit_maxmessages.
for timestamp in self.users[username]["msgtimes"]:
if timestamp >= time_now - rl_tw:
if msg_duplicate:
self.users[username]["timeout_until"] = datetime.now() + timedelta(seconds=self.ratelimit_timeout)
self.send_msg(f"'{username}' muted for {self.ratelimit_timeout} seconds. Reason: spamming")
self.bosh_user_mute(username)
break
else:
msgcount += 1
# NOTE(review): whether this check sits inside or after the loop above cannot
# be told from the flattened source.
if msgcount >= self.ratelimit_maxmessages:
self.users[username]["timeout_until"] = datetime.now() + timedelta(seconds=self.ratelimit_timeout)
self.send_msg(f"'{username}' muted for {self.ratelimit_timeout} seconds. Reason: ratelimits")
self.bosh_user_mute(username)
self.users[username]["lastmsg"] = hash(body)
# End of ratelimit maneuvers
# Substring match of each banned word against the lower-cased message; the
# banned words themselves are not lower-cased.
for banned_word in self.banned_words:
if banned_word in body.lower():
self.users[username]["timeout_until"] = datetime.now() + timedelta(seconds=self.ratelimit_timeout)
self.send_msg(f"'{username}' muted for {self.ratelimit_timeout} seconds. Reason: banned word")
self.bosh_user_mute(username)
#print(json.dumps(xmltodict.parse(req)["body"], indent=2) + "\n")
# Put some rules or commands down there that apply for any user, regardless of role
else:
pass
#if not "urn:xmpp:hints" in req:
#print(json.dumps(xmltodict.parse(req)["body"], indent=2) + "\n")
else:
pass
#if not "urn:xmpp:hints" in req:
#print(json.dumps(xmltodict.parse(req)["body"], indent=2) + "\n")
def start_reader_loop(self):
    """Start ``_reader_loop`` in a background daemon thread."""
    Thread(target=self._reader_loop, daemon=True).start()
def run(self):
    """Connect, start both worker threads, then serve console commands forever.

    Each line read from standard input is handed to ``process_command`` with
    ``cmd=True``.  This method never returns.
    """
    self.connect()
    self.start_receiver_loop()
    self.start_reader_loop()
    while True:
        line = input()
        self.process_command(line, cmd=True)
if __name__ == "__main__":
    # Build the bot from settings.py, copy the moderation tunables over the
    # built-in defaults, then hand control to the bot.
    bot = ModBot(settings.instance, settings.username, settings.password, settings.room)
    for option in (
        "timeout_default",
        "ratelimit_timewindow",
        "ratelimit_maxmessages",
        "ratelimit_timeout",
    ):
        setattr(bot, option, getattr(settings, option))
    bot.run()

29
settings.py Normal file
View file

@ -0,0 +1,29 @@
### Authorization settings ###
# Host name of the PeerTube instance, without the "https://" scheme
# (the bot prepends the scheme itself).
instance = "xxivproduction.video"
# Credentials of a user with the MODERATOR role on this instance or room.
# Replace both placeholders before running the bot.
username = "USERNAME"
password = "PASSWORD"
# Room UUID (taken from the direct chat URL).
# To find it, click the chain icon in the upper right corner of the chat area on the stream page. Example:
# the direct chat URL looks like https://xxivproduction.video/plugins/livechat/router/webchat/room/4d393887-f850-eeee-bbbb-0178976ad016
# and the part you need is "4d393887-f850-eeee-bbbb-0178976ad016".
# The Room UUID is also shown in the Share -> Embed menu.
room = "f3e0a5fe-e0be-42c4-b3ab-9e42cf5ecc2e"
### Moderation settings ###
# Timeout length (in seconds) used when the !timeout command is sent without a 2nd argument
timeout_default = 300
# Rate limit settings
# Time window (in seconds) over which message rates are measured
ratelimit_timewindow = 10
# How many messages one user is allowed to send within 'ratelimit_timewindow' seconds
ratelimit_maxmessages = 4
# Length of the timeout (in seconds) applied to a user who exceeds the limits
ratelimit_timeout = 30

549
xmltodict.py Normal file
View file

@ -0,0 +1,549 @@
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"
"""
Grabbed from https://github.com/martinblech/xmltodict
Copyright (C) 2012 Martin Blech and individual contributors.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
try:
from defusedexpat import pyexpat as expat
except ImportError:
from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
try: # pragma no cover
from cStringIO import StringIO
except ImportError: # pragma no cover
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from collections import OrderedDict
from inspect import isgenerator
# String-type aliases: use the name `basestring` / `unicode` where it exists
# and fall back to `str` where looking it up raises NameError.
try: # pragma no cover
_basestring = basestring
except NameError: # pragma no cover
_basestring = str
try: # pragma no cover
_unicode = unicode
except NameError: # pragma no cover
_unicode = str
# Package metadata of this bundled copy of xmltodict.
__author__ = 'Martin Blech'
__version__ = '0.12.0'
__license__ = 'MIT'
class ParsingInterrupted(Exception):
    """Raised to stop parsing when a streaming ``item_callback`` returns a false value."""
class _DictSAXHandler(object):
    """Callback target that assembles parser events into nested dictionaries.

    The methods are installed as expat handlers by ``parse``.  ``path`` holds
    the ``(name, attrs)`` pairs of the currently open elements, ``stack`` the
    suspended ``(item, data)`` state of their parents, ``item`` the dictionary
    being built for the current element and ``data`` its text fragments.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None,
                 force_list=None,
                 comment_key='#comment'):
        # Parsing state.
        self.path = []
        self.stack = []
        self.data = []
        self.item = None
        self.namespace_declarations = OrderedDict()
        # Options, stored unchanged.
        self.item_depth = item_depth
        self.item_callback = item_callback
        self.xml_attribs = xml_attribs
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        self.force_list = force_list
        self.comment_key = comment_key

    def _build_name(self, full_name):
        """Replace the namespace part of ``full_name`` by its configured alias.

        Names are returned unchanged when no ``namespaces`` mapping was given
        or when the name contains no separator.  A namespace mapped to a
        false value is dropped entirely.
        """
        if self.namespaces is None:
            return full_name
        sep_at = full_name.rfind(self.namespace_separator)
        if sep_at == -1:
            return full_name
        namespace = full_name[:sep_at]
        local_name = full_name[sep_at + 1:]
        try:
            alias = self.namespaces[namespace]
        except KeyError:
            # Unmapped namespaces are kept as they are.
            alias = namespace
        if alias:
            return self.namespace_separator.join((alias, local_name))
        return local_name

    def _attrs_to_dict(self, attrs):
        """Turn a flat ``[key, value, key, value, ...]`` sequence into a dict."""
        if not isinstance(attrs, dict):
            attrs = self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
        return attrs

    def startNamespaceDecl(self, prefix, uri):
        """Remember a namespace declaration until the next element with attributes."""
        self.namespace_declarations[prefix or ''] = uri

    def startElement(self, full_name, attrs):
        """Open an element: push the parent's state and start a fresh item."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        if attrs and self.namespace_declarations:
            attrs['xmlns'] = self.namespace_declarations
            self.namespace_declarations = OrderedDict()
        self.path.append((name, attrs or None))
        if len(self.path) <= self.item_depth:
            # Above the streaming depth nothing is collected.
            return
        self.stack.append((self.item, self.data))
        if self.xml_attribs:
            entries = []
            for key, value in attrs.items():
                prefixed = self.attr_prefix + self._build_name(key)
                if self.postprocessor:
                    entry = self.postprocessor(self.path, prefixed, value)
                else:
                    entry = (prefixed, value)
                # A postprocessor may drop an attribute by returning a false value.
                if entry:
                    entries.append(entry)
            attrs = self.dict_constructor(entries)
        else:
            attrs = None
        self.item = attrs or None
        self.data = []

    def endElement(self, full_name):
        """Close an element and merge its item/text into the parent's item."""
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            # Streaming mode: hand the finished item to the callback.
            streamed = self.item
            if streamed is None:
                streamed = self.cdata_separator.join(self.data) if self.data else None
            if not self.item_callback(self.path, streamed):
                raise ParsingInterrupted()
        if self.stack:
            text = self.cdata_separator.join(self.data) if self.data else None
            finished = self.item
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and text:
                text = text.strip() or None
            if text and self.force_cdata and finished is None:
                finished = self.dict_constructor()
            if finished is None:
                # Text-only (or empty) element: store the text itself.
                self.item = self.push_data(self.item, name, text)
            else:
                if text:
                    self.push_data(finished, self.cdata_key, text)
                self.item = self.push_data(self.item, name, finished)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Collect one fragment of character data for the current element."""
        if self.data:
            self.data.append(data)
        else:
            self.data = [data]

    def comments(self, data):
        """Store a comment under ``comment_key`` in the current item."""
        if self.strip_whitespace:
            data = data.strip()
        self.item = self.push_data(self.item, self.comment_key, data)

    def push_data(self, item, key, data):
        """Add ``data`` under ``key`` to ``item`` and return the item.

        A repeated key turns the stored value into a list.  When a
        postprocessor is set it may rewrite the ``(key, data)`` pair, or
        return None to skip the entry.  ``item`` is created on demand.
        """
        if self.postprocessor is not None:
            processed = self.postprocessor(self.path, key, data)
            if processed is None:
                return item
            key, data = processed
        if item is None:
            item = self.dict_constructor()
        try:
            existing = item[key]
            if isinstance(existing, list):
                existing.append(data)
            else:
                item[key] = [existing, data]
        except KeyError:
            item[key] = [data] if self._should_force_list(key, data) else data
        return item

    def _should_force_list(self, key, value):
        """Tell whether a first value for ``key`` must be wrapped in a list.

        ``force_list`` may be a bool, a container of keys, or a callable
        taking ``(path, key, value)``.
        """
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat force_list as a callable.
            return self.force_list(self.path[:-1], key, value)
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', disable_entities=True, process_comments=False, **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a string, a file-like object, or a generator
    of strings.  Extra keyword arguments are forwarded to the SAX handler;
    the options described below are among them.

    If `xml_attribs` is `True` (the default), element attributes are put in
    the dictionary among regular child elements, using `@` as a prefix to
    avoid collisions.  If set to `False`, they are just ignored.

    Simple example::

        doc = parse('<a prop="x"><b>1</b><b>2</b></a>')
        doc['a']['@prop']   # 'x'
        doc['a']['b']       # ['1', '2']

    If `item_depth` is `0` (the default), the function returns a dictionary
    for the root element.  Otherwise it calls `item_callback` every time an
    item at the specified depth is found and returns `None` in the end
    (streaming mode).  The callback receives the `path` from the document
    root to the item (name/attributes pairs) and the `item` itself.  If the
    callback's return value is false-ish, parsing is stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        def handle(path, item):
            print('path:%s item:%s' % (path, item))
            return True

        parse('<a prop="x"><b>1</b><b>2</b></a>',
              item_depth=2, item_callback=handle)

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new
    `(key, value)` pair where both may have changed, or `None` to drop the
    entry::

        def postprocessor(path, key, value):
            try:
                return key + ':int', int(value)
            except (ValueError, TypeError):
                return key, value

        parse('<a><b>1</b><b>2</b><b>x</b></a>', postprocessor=postprocessor)
        # {'a': {'b:int': [1, 2], 'b': 'x'}}

    You can pass an alternate version of `expat` (such as `defusedexpat`)
    through the `expat` parameter.

    Use `force_list` to get a list even when a key has only a single child.
    It can be `True`, a tuple of keys, or a callable receiving `path`, `key`
    and `value` for more complex decisions.  It is applied after any
    user-supplied postprocessor has run.  For example,
    `force_list=('interface',)` turns::

        <interfaces><interface><name>em0</name></interface></interfaces>

    into `{'interfaces': {'interface': [{'name': 'em0'}]}}`.

    If `process_comments` is `True`, comments are added under `comment_key`
    (default `'#comment'`) to the element that contains them::

        parse('<a><b><!-- b comment --><d>2</d></b></a>',
              process_comments=True)
        # {'a': {'b': {'#comment': 'b comment', 'd': '2'}}}
    """
    # The handler always receives the separator, even if it is disabled for
    # the parser below.
    handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs)

    if isinstance(xml_input, _unicode):
        encoding = encoding or 'utf-8'
        xml_input = xml_input.encode(encoding)

    xml_parser = expat.ParserCreate(
        encoding,
        namespace_separator if process_namespaces else None,
    )
    try:
        xml_parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass

    xml_parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
    xml_parser.StartElementHandler = handler.startElement
    xml_parser.EndElementHandler = handler.endElement
    xml_parser.CharacterDataHandler = handler.characters
    if process_comments:
        xml_parser.CommentHandler = handler.comments
    xml_parser.buffer_text = True

    if disable_entities:
        try:
            # Attempt to disable DTD in Jython's expat parser (Xerces-J).
            xml_parser._reader.setFeature(
                "http://apache.org/xml/features/disallow-doctype-decl", True)
        except AttributeError:
            # For CPython / expat parser.
            # Anything not handled ends up here and entities aren't expanded.
            xml_parser.DefaultHandler = lambda x: None
            # Expects an integer return; zero means failure -> expat.ExpatError.
            xml_parser.ExternalEntityRefHandler = lambda *x: 1

    if hasattr(xml_input, 'read'):
        xml_parser.ParseFile(xml_input)
    elif isgenerator(xml_input):
        for piece in xml_input:
            xml_parser.Parse(piece, False)
        # An empty final chunk tells the parser that the input is complete.
        xml_parser.Parse(b'', True)
    else:
        xml_parser.Parse(xml_input, True)
    return handler.item
def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
if not namespaces:
return name
try:
ns, name = name.rsplit(ns_sep, 1)
except ValueError:
pass
else:
ns_res = namespaces.get(ns.strip(attr_prefix))
name = '{}{}{}{}'.format(
attr_prefix if ns.startswith(attr_prefix) else '',
ns_res, ns_sep, name) if ns_res else name
return name
def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          namespace_separator=':',
          namespaces=None,
          full_document=True,
          expand_iter=None):
    """Emit SAX events on `content_handler` for one key/value pair.

    Recurses into child mappings, increasing `depth` by one per level.

    Args:
        key: Element name; passed through `_process_namespace` first.
        value: Element content. A non-iterable, a string or a dict is treated
            as a single element; any other iterable produces one element
            named `key` per item.
        content_handler: Object receiving `startElement`, `endElement`,
            `characters` and `ignorableWhitespace` calls.
        attr_prefix: Keys starting with this prefix become XML attributes.
        cdata_key: Key whose value becomes the element's character data.
        depth: Current nesting level (0 for the root).
        preprocessor: Optional callable ``(key, value)`` returning a
            replacement ``(key, value)`` pair, or None to skip the element.
        pretty: When true, emit `newl`/`indent` whitespace around elements.
        newl: Line terminator used when `pretty` is set.
        indent: Indentation unit used when `pretty` is set (repeated `depth`
            times).
        namespace_separator: Separator handed to `_process_namespace`.
        namespaces: Namespace mapping handed to `_process_namespace`.
        full_document: When true, a second item at depth 0 is rejected.
        expand_iter: When set, a non-string iterable item is wrapped as
            ``{expand_iter: item}`` instead of being stringified.

    Raises:
        ValueError: If `full_document` is true and more than one element
            would be emitted at depth 0.
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    for index, node in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        # Normalise every item to a mapping of attributes/children/cdata.
        if node is None:
            node = OrderedDict()
        elif isinstance(node, bool):
            node = _unicode('true') if node else _unicode('false')
        elif not isinstance(node, dict):
            if (expand_iter and hasattr(node, '__iter__')
                    and not isinstance(node, _basestring)):
                node = OrderedDict(((expand_iter, node),))
            else:
                node = _unicode(node)
        if isinstance(node, _basestring):
            node = OrderedDict(((cdata_key, node),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for item_key, item_value in node.items():
            if item_key == cdata_key:
                cdata = item_value
                continue
            if item_key.startswith(attr_prefix):
                item_key = _process_namespace(item_key, namespaces,
                                              namespace_separator, attr_prefix)
                # Namespace declarations given as a {prefix: uri} dict are
                # expanded into xmlns / xmlns:<prefix> attributes. The key is
                # built from `attr_prefix` so a custom prefix is honoured too
                # (it used to be the literal '@xmlns').
                if (item_key == attr_prefix + 'xmlns'
                        and isinstance(item_value, dict)):
                    # Distinct loop names: do not rebind the element mapping
                    # that the enclosing loop is working on.
                    for ns_prefix, ns_uri in item_value.items():
                        attr = 'xmlns{}'.format(
                            ':{}'.format(ns_prefix) if ns_prefix else '')
                        attrs[attr] = _unicode(ns_uri)
                    continue
                if not isinstance(item_value, _unicode):
                    item_value = _unicode(item_value)
                attrs[item_key[len(attr_prefix):]] = item_value
                continue
            children.append((item_key, item_value))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent, namespaces=namespaces,
                  namespace_separator=namespace_separator,
                  expand_iter=expand_iter)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            # Re-indent so the closing tag lines up with the opening one.
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            short_empty_elements=False,
            **kwargs):
    """Serialize `input_dict` to XML (the inverse of `parse`).

    Keys that start with `attr_prefix` (default=`'@'`) become attributes of
    the enclosing element, and the key equal to `cdata_key`
    (default=`'#text'`) supplies its character data.

    Args:
        input_dict: Mapping to serialize.
        output: Optional file-like object to write to. When omitted, the XML
            is collected in memory and returned.
        encoding: Encoding handed to the XML generator.
        full_document: When true, `input_dict` must hold exactly one root
            and the generator's startDocument/endDocument are called.
        short_empty_elements: When true, `True` is passed as the third
            positional argument of `XMLGenerator`.
        **kwargs: Forwarded to `_emit`, e.g. `pretty` (default=`False`) to
            enable pretty-printing, with `newl` and `indent` to override the
            default newline terminator and tab indentation.

    Returns:
        The XML text when `output` was not supplied, otherwise None.

    Raises:
        ValueError: If `full_document` is true and `input_dict` does not
            contain exactly one key.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError('Document must have exactly one root.')

    # Only hand text back to the caller when we own the buffer.
    return_text = output is None
    if return_text:
        output = StringIO()

    # The third argument is only passed when requested, so generators that
    # do not accept it keep working for the default case.
    generator_args = (output, encoding)
    if short_empty_elements:
        generator_args += (True,)
    content_handler = XMLGenerator(*generator_args)

    if full_document:
        content_handler.startDocument()
    for root_key, root_value in input_dict.items():
        _emit(root_key, root_value, content_handler,
              full_document=full_document, **kwargs)
    if full_document:
        content_handler.endDocument()

    if not return_text:
        return None
    text = output.getvalue()
    try:  # pragma no cover
        # Byte buffers expose .decode(); text buffers already hold str.
        text = text.decode(encoding)
    except AttributeError:  # pragma no cover
        pass
    return text
if __name__ == '__main__':  # pragma: no cover
    # Command-line mode: parse XML from stdin and write marshal-serialized
    # (path, item) tuples to stdout. Takes one argument, the item depth.
    import marshal
    import sys

    # Use the binary streams when both are available; otherwise fall back
    # to the plain stream objects for both.
    try:
        stdin, stdout = sys.stdin.buffer, sys.stdout.buffer
    except AttributeError:
        stdin, stdout = sys.stdin, sys.stdout

    # Exactly one command-line argument is expected; any other count makes
    # the unpacking raise ValueError.
    (depth_arg,) = sys.argv[1:]
    item_depth = int(depth_arg)

    def handle_item(path, item):
        """Dump one (path, item) pair to stdout with marshal.

        Always returns True (presumably the signal for `parse` to keep
        going -- confirm against the `item_callback` contract).
        """
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        # At depth 0 the result returned by parse is dumped explicitly,
        # with an empty path.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass