-
Notifications
You must be signed in to change notification settings - Fork 44.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'dev' into ntindle/open-2097-integration-test-complete-the-tutorial
- Loading branch information
Showing 7 changed files with 283 additions and 46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
autogpt_platform/backend/backend/blocks/code_extraction_block.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import re | ||
|
||
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema | ||
from backend.data.model import SchemaField | ||
|
||
|
||
class CodeExtractionBlock(Block):
    """Extract fenced code blocks from text and group them by language.

    The block scans the input text for Markdown-style fences of the form
    "```<language> ... ```" and yields one output per language that has
    content, followed by the text left over once the recognised fences
    are removed.
    """

    class Input(BlockSchema):
        text: str = SchemaField(
            description="Text containing code blocks to extract (e.g., AI response)",
            placeholder="Enter text containing code blocks",
        )

    class Output(BlockSchema):
        html: str = SchemaField(description="Extracted HTML code")
        css: str = SchemaField(description="Extracted CSS code")
        javascript: str = SchemaField(description="Extracted JavaScript code")
        python: str = SchemaField(description="Extracted Python code")
        sql: str = SchemaField(description="Extracted SQL code")
        java: str = SchemaField(description="Extracted Java code")
        cpp: str = SchemaField(description="Extracted C++ code")
        csharp: str = SchemaField(description="Extracted C# code")
        json_code: str = SchemaField(description="Extracted JSON code")
        bash: str = SchemaField(description="Extracted Bash code")
        php: str = SchemaField(description="Extracted PHP code")
        ruby: str = SchemaField(description="Extracted Ruby code")
        yaml: str = SchemaField(description="Extracted YAML code")
        markdown: str = SchemaField(description="Extracted Markdown code")
        typescript: str = SchemaField(description="Extracted TypeScript code")
        xml: str = SchemaField(description="Extracted XML code")
        remaining_text: str = SchemaField(
            description="Remaining text after code extraction"
        )

    def __init__(self):
        super().__init__(
            id="d3a7d896-3b78-4f44-8b4b-48fbf4f0bcd8",
            description="Extracts code blocks from text and identifies their programming languages",
            categories={BlockCategory.TEXT},
            input_schema=CodeExtractionBlock.Input,
            output_schema=CodeExtractionBlock.Output,
            test_input={
                "text": "Here's a Python example:\n```python\nprint('Hello World')\n```\nAnd some HTML:\n```html\n<h1>Title</h1>\n```"
            },
            test_output=[
                ("html", "<h1>Title</h1>"),
                ("python", "print('Hello World')"),
                ("remaining_text", "Here's a Python example:\nAnd some HTML:"),
            ],
        )

    def run(self, input_data: Input, **kwargs) -> BlockOutput:
        """Yield extracted code per language, then the leftover text.

        Yields:
            ``(output_name, code)`` for each language with non-empty code, in
            the order of the alias table below, and finally
            ``("remaining_text", text)`` when any non-whitespace text is left
            after the recognised fences are removed.
        """
        # Output field name -> fence tags accepted for that language.
        language_aliases = {
            "html": ["html", "htm"],
            "css": ["css"],
            "javascript": ["javascript", "js"],
            "python": ["python", "py"],
            "sql": ["sql"],
            "java": ["java"],
            "cpp": ["cpp", "c++"],
            "csharp": ["csharp", "c#", "cs"],
            "json_code": ["json"],
            "bash": ["bash", "shell", "sh"],
            "php": ["php"],
            "ruby": ["ruby", "rb"],
            "yaml": ["yaml", "yml"],
            "markdown": ["markdown", "md"],
            "typescript": ["typescript", "ts"],
            "xml": ["xml"],
        }

        # Collect the code for every alias of a language and merge it under
        # the canonical output name.
        for canonical_name, aliases in language_aliases.items():
            per_alias = (
                self.extract_code(input_data.text, alias) for alias in aliases
            )
            code = "\n\n".join(snippet for snippet in per_alias if snippet)

            if code:  # Only yield if there's actual code content
                yield canonical_name, code

        # Build one alternation over every known alias to strip the fences.
        all_aliases = "|".join(
            re.escape(alias)
            for aliases in language_aliases.values()
            for alias in aliases
        )
        fence_pattern = r"```(?:" + all_aliases + r")\s+[\s\S]*?```"

        # Match case-insensitively, exactly like extract_code does; otherwise
        # a fence such as "```Python" would be extracted above but still be
        # left behind in remaining_text.
        remaining_text = re.sub(
            fence_pattern, "", input_data.text, flags=re.IGNORECASE
        ).strip()
        # Collapse the blank lines left where fences used to be.
        remaining_text = re.sub(r"\n\s*\n", "\n", remaining_text)

        if remaining_text:  # Only yield if there's remaining text
            yield "remaining_text", remaining_text

    def extract_code(self, text: str, language: str) -> str:
        """Return the contents of every fenced block tagged with *language*.

        Args:
            text: Text to search for "```<language> ... ```" fences.
            language: Fence tag to look for; matched literally (regex
                metacharacters are escaped) and case-insensitively.

        Returns:
            The stripped contents of all matching blocks joined by a blank
            line, or ``""`` when there is no non-empty matching block.
        """
        # Escape so tags like "c++" or "c#" are matched literally.
        tag = re.escape(language)
        # The tag must be followed by whitespace, so "java" does not match a
        # "```javascript" fence.
        pattern = re.compile(rf"```{tag}\s+(.*?)```", re.DOTALL | re.IGNORECASE)
        stripped_blocks = (
            match.group(1).strip() for match in pattern.finditer(text)
        )
        # Skip empty fences so they do not add stray blank lines to the result.
        return "\n\n".join(block for block in stripped_blocks if block)
Oops, something went wrong.