append_index_html_to_internal_links.py
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Script to fix the links in the staged website.
Finds all internal links which do not have index.html at the end and appends
index.html in the appropriate place (preserving anchors, etc).
Usage:
From root directory, after running the jekyll build, execute
'python .jenkins/append_index_html_to_internal_links.py'.
Dependencies:
beautifulsoup4
Installable via pip as 'sudo pip install beautifulsoup4' or apt via
'sudo apt-get install python-beautifulsoup4'.
"""
from __future__ import print_function
import argparse
import fnmatch
import os
import re
from bs4 import BeautifulSoup
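# Python 2/3 compatibility: Python 3 has no builtin 'unicode', so alias it
# to str there.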
try:
    unicode  # pylint: disable=unicode-builtin
except NameError:
    unicode = str
# Original link match. Matches any string which starts with '/' and doesn't
# have a file extension.
linkMatch = r'^\/(.*\.(?!([^\/]+)$))?[^.]*$'
# Regex which matches strings of type /internal/link/#anchor. Breaks into two
# groups for ease of inserting 'index.html'.
anchorMatch1 = r'(.+\/)(#[^\/]+$)'
# Regex which matches strings of type /internal/link#anchor. Breaks into two
# groups for ease of inserting 'index.html'.
anchorMatch2 = r'(.+\/[a-zA-Z0-9]+)(#[^\/]+$)'
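# Illustrative examples of the patterns above (the paths are hypothetical):
#   linkMatch matches '/documentation/sdks/' and '/documentation/sdks#java',
#   but not '/images/logo.png' (it has a file extension).
#   anchorMatch1 splits '/documentation/sdks/#java' into
#   ('/documentation/sdks/', '#java').
#   anchorMatch2 splits '/documentation/sdks#java' into
#   ('/documentation/sdks', '#java').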
parser = argparse.ArgumentParser(description='Fix links in the staged website.')
parser.add_argument('content_dir', help='Generated content directory to fix links in')
args = parser.parse_args()
matches = []
# Recursively walk content directory and find all html files.
for root, dirnames, filenames in os.walk(args.content_dir):
    for filename in fnmatch.filter(filenames, '*.html'):
        # Javadoc does not have the index.html problem, so omit it.
        if 'javadoc' not in root:
            matches.append(os.path.join(root, filename))
print('Matches: ' + str(len(matches)))
# Iterates over each matched file looking for link matches.
for match in matches:
    print('Fixing links in: ' + match)
    mf = open(match)
    soup = BeautifulSoup(mf, 'html.parser')
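    # Example of the alias tag this loop rewrites (the URL is illustrative):
    #   <meta http-equiv="refresh" content="0; url=/new/location/">
    # becomes
    #   <meta http-equiv="refresh" content="0; url=/new/location/index.html">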
    # Iterates over every <meta> tag used for aliases (redirected links).
    for meta in soup.findAll('meta'):
        try:
            content = meta['content']
            alias = content.replace('0; url=', '')
            if re.match(linkMatch, alias) is not None:
                if alias.endswith('/'):
                    # /internal/link/
                    meta['content'] = content + 'index.html'
                else:
                    # /internal/link
                    meta['content'] = content + '/index.html'
                mf.close()
                html = unicode(soup).encode('utf-8')
                # Write back to the file.
                with open(match, "wb") as f:
                    print('Replacing ' + content + ' with: ' + meta['content'])
                    f.write(html)
        except KeyError:
            # Some <meta> tags don't have a content attribute.
            continue
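    # Example rewrites performed below (the paths are illustrative):
    #   <a href="/get-started/">            -> <a href="/get-started/index.html">
    #   <a href="/documentation/sdks#java"> -> <a href="/documentation/sdks/index.html#java">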
    # Iterates over every <a> tag.
    for a in soup.findAll('a'):
        try:
            hr = a['href']
            if re.match(linkMatch, hr) is not None:
                if hr.endswith('/'):
                    # /internal/link/
                    a['href'] = hr + 'index.html'
                elif re.match(anchorMatch1, hr) is not None:
                    # /internal/link/#anchor
                    mat = re.match(anchorMatch1, hr)
                    a['href'] = mat.group(1) + 'index.html' + mat.group(2)
                elif re.match(anchorMatch2, hr) is not None:
                    # /internal/link#anchor
                    mat = re.match(anchorMatch2, hr)
                    a['href'] = mat.group(1) + '/index.html' + mat.group(2)
                else:
                    # /internal/link
                    a['href'] = hr + '/index.html'
                mf.close()
                html = unicode(soup).encode('utf-8')
                # Write back to the file.
                with open(match, "wb") as f:
                    print('Replacing ' + hr + ' with: ' + a['href'])
                    f.write(html)
        except KeyError:
            # Some <a> tags don't have an href attribute.
            continue