Skip to content

Commit f6afda7

Browse files
Add language support (#210)
Add language details to embedded properties and root microformats, using the following order of specificity (most specific first):
- embedded properties (class=e-* lang=..)
- root microformats (class=h-* lang=..)
- document root (<html lang=..>)
1 parent cf322c4 commit f6afda7

File tree

5 files changed

+49
-6
lines changed

5 files changed

+49
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file.
55
- make relative URLs in e-* properties absolute (#201)
66
- fix whitespace in plaintext conversion (#207)
77
- add srcset support (#209)
8+
- add language support (#210)
89

910
## 1.1.3 - 2023-06-28
1011
- reduce instances where photo is implied (#135)

mf2py/parse_property.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -94,13 +94,20 @@ def datetime(el, default_date=None):
9494
)
9595

9696

97-
def embedded(el, base_url=""):
97+
def embedded(el, root_lang, document_lang, base_url=""):
9898
"""Process e-* properties"""
9999
for tag in el.find_all():
100100
for attr in ("href", "src", "cite", "data", "poster"):
101101
if attr in tag.attrs:
102102
tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr])
103-
return {
103+
prop_value = {
104104
"html": el.decode_contents().strip(), # secret bs4 method to get innerHTML
105105
"value": get_textContent(el, replace_img=True, base_url=base_url),
106106
}
107+
if lang := el.attrs.get("lang"):
108+
prop_value["lang"] = lang
109+
elif root_lang:
110+
prop_value["lang"] = root_lang
111+
elif document_lang:
112+
prop_value["lang"] = document_lang
113+
return prop_value

mf2py/parser.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ def __init__(self, doc=None, url=None, html_parser=None):
6868
"version": __version__,
6969
},
7070
}
71+
self.lang = None
7172

7273
# use default parser if none specified
7374
self.__html_parser__ = html_parser or "html5lib"
@@ -128,6 +129,8 @@ def __init__(self, doc=None, url=None, html_parser=None):
128129
self.__url__ = try_urljoin(self.__url__, poss_base_url)
129130

130131
if self.__doc__ is not None:
132+
if document := self.__doc__.find("html"):
133+
self.lang = document.attrs.get("lang")
131134
# parse!
132135
self.parse()
133136

@@ -161,13 +164,15 @@ def handle_microformat(
161164
el = backcompat.apply_rules(el, self.__html_parser__)
162165
root_class_names = mf2_classes.root(el.get("class", []))
163166

167+
root_lang = el.attrs.get("lang")
168+
164169
# parse for properties and children
165170
for child in get_children(el):
166171
(
167172
child_props,
168173
child_children,
169174
child_parsed_types_aggregation,
170-
) = parse_props(child)
175+
) = parse_props(child, root_lang)
171176
for key, new_value in child_props.items():
172177
prop_value = properties.get(key, [])
173178
prop_value.extend(new_value)
@@ -239,9 +244,13 @@ def handle_microformat(
239244
else:
240245
microformat["value"] = simple_value
241246

247+
if root_lang:
248+
microformat["lang"] = root_lang
249+
elif self.lang:
250+
microformat["lang"] = self.lang
242251
return microformat
243252

244-
def parse_props(el):
253+
def parse_props(el, root_lang):
245254
"""Parse the properties from a single element"""
246255
props = {}
247256
children = []
@@ -363,7 +372,7 @@ def parse_props(el):
363372
embedded_el = copy.copy(embedded_el)
364373
temp_fixes.rm_templates(embedded_el)
365374
e_value = parse_property.embedded(
366-
embedded_el, base_url=self.__url__
375+
embedded_el, root_lang, self.lang, base_url=self.__url__
367376
)
368377

369378
if root_class_names:
@@ -394,7 +403,7 @@ def parse_props(el):
394403
child_properties,
395404
child_microformats,
396405
child_parsed_types_aggregation,
397-
) = parse_props(child)
406+
) = parse_props(child, root_lang)
398407
for prop_name in child_properties:
399408
v = props.get(prop_name, [])
400409
v.extend(child_properties[prop_name])

test/examples/language.html

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,15 @@
1+
<html lang="it">
2+
<div class="h-card">
3+
<h1 class="p-name">Romero</h1>
4+
</div>
5+
<div class="h-entry">
6+
<h1 class="p-name">Un titolo italiano</h1>
7+
<div class="e-content" lang="en">With an <em>english</em> summary</div>
8+
<div class="e-content">Con un riassunto <em>italiano</em></div>
9+
</div>
10+
<div class="h-entry" lang="sv">
11+
<h1 class="p-name">En svensk titel</h1>
12+
<div class="e-content" lang="en">With an <em>english</em> summary</div>
13+
<div class="e-content">Och <em>svensk</em> huvudtext</div>
14+
</div>
15+
</html>

test/test_parser.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1116,3 +1116,14 @@ def test_all_u_cases():
11161116
make_labelled_cmp("all_u_cases_" + str(i))(
11171117
"http://example.com/test", result["items"][0]["properties"]["url"][i]
11181118
)
1119+
1120+
1121+
def test_language():
1122+
result = parse_fixture("language.html")
1123+
assert result["items"][0]["lang"] == "it"
1124+
assert result["items"][1]["lang"] == "it"
1125+
assert result["items"][1]["properties"]["content"][0]["lang"] == "en"
1126+
assert result["items"][1]["properties"]["content"][1]["lang"] == "it"
1127+
assert result["items"][2]["lang"] == "sv"
1128+
assert result["items"][2]["properties"]["content"][0]["lang"] == "en"
1129+
assert result["items"][2]["properties"]["content"][1]["lang"] == "sv"

0 commit comments

Comments
 (0)