Skip to content

Commit

Permalink
[.NET] Japanese CJK configuration + DateExtractor refinements (#2575)
Browse files Browse the repository at this point in the history
* Japanese CJK configuration + DateExtractor support

* Fixed named group in JavaScript

* Localized example comments in Korean config files

* Removed duplicate test cases

Co-authored-by: LionbridgeCS2 <[email protected]>
  • Loading branch information
aitelint and LionbridgeCS2 authored Apr 30, 2021
1 parent 9778199 commit 3cf716e
Show file tree
Hide file tree
Showing 66 changed files with 2,927 additions and 6,726 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public static class DateTimeDefinitions
public static readonly string DateThisRegex = $@"(这个|这一个|这|这一|本){WeekDayRegex}";
public static readonly string DateLastRegex = $@"(上一个|上个|上一|上|最后一个|最后)(的)?{WeekDayRegex}";
public static readonly string DateNextRegex = $@"(下一个|下个|下一|下)(的)?{WeekDayRegex}";
// NOTE(review): `^[.]` can only match a literal '.' located directly at the `^` anchor.
// It looks like a deliberate "matches nothing useful" placeholder for patterns this
// language does not define — confirm against the extractor that consumes these constants.
public const string SpecialMonthRegex = @"^[.]";
public const string SpecialYearRegex = @"^[.]";
// Alternation of literal relative-day words; both 后天 and 後天 spellings are listed.
public const string SpecialDayRegex = @"(最近|前天|后天|昨天|明天|今天|今日|明日|昨日|大后天|大前天|後天|大後天)";
public const string SpecialDayWithNumRegex = @"^[.]";
public static readonly string WeekDayOfMonthRegex = $@"((({MonthRegex}|{MonthNumRegex})的\s*)(?<cardinal>第一个|第二个|第三个|第四个|第五个|最后一个)\s*{WeekDayRegex})";
Expand Down Expand Up @@ -75,6 +77,7 @@ public static class DateTimeDefinitions
public static readonly string MonthSuffixRegex = $@"(?<msuf>({RelativeMonthRegex}|{MonthRegex}))";
public static readonly string SimpleCasesRegex = $@"((从)\s*)?(({YearRegex}|{DatePeriodYearInCJKRegex})\s*)?{MonthSuffixRegex}({DatePeriodDayRegexInCJK}|{DayRegex})\s*{DatePeriodTillRegex}\s*({DatePeriodDayRegexInCJK}|{DayRegex})((\s+|\s*,\s*){YearRegex})?";
public static readonly string YearAndMonth = $@"({DatePeriodYearInCJKRegex}|{YearRegex})\s*{MonthRegex}";
// {YearNumRegex}, then exactly one separator from the class [/\\\-] ('/', '\' or '-'), then {MonthNumRegex}.
// The trailing `\b$` anchors the match to the end of the input, so the pair is only recognised as the last token.
public static readonly string SimpleYearAndMonth = $@"({YearNumRegex}[/\\\-]{MonthNumRegex}\b$)";
public static readonly string PureNumYearAndMonth = $@"({YearRegexInNumber}\s*[-\.\/]\s*{MonthNumRegex})|({MonthNumRegex}\s*\/\s*{YearRegexInNumber})";
public static readonly string OneWordPeriodRegex = $@"(((?<yearrel>(明|今|去)年)\s*)?{MonthRegex}|({DatePeriodThisRegex}|{DatePeriodLastRegex}|{DatePeriodNextRegex})(?<halfTag>半)?\s*(周末|周|月|年)|周末|(今|明|去|前|后)年(\s*{HalfYearRegex})?)";
public static readonly string WeekOfMonthRegex = $@"(?<wom>{MonthSuffixRegex}的(?<cardinal>第一|第二|第三|第四|第五|最后一)\s*周\s*)";
Expand All @@ -86,6 +89,8 @@ public static class DateTimeDefinitions
public static readonly string YearToYearSuffixRequired = $@"({DateRangePrepositions})({DatePeriodYearInCJKRegex}|{YearRegex})\s*({DatePeriodTillSuffixRequiredRegex})\s*({DatePeriodYearInCJKRegex}|{YearRegex})\s*(之间|之内|期间|中间|间)";
public static readonly string MonthToMonth = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillRegex}({MonthRegex})";
public static readonly string MonthToMonthSuffixRequired = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillSuffixRequiredRegex}({MonthRegex})\s*(之间|之内|期间|中间|间)";
// NOTE(review): `^[.]` can only match a literal '.' located directly at the `^` anchor.
// It looks like a placeholder for patterns that are not defined for this language — confirm with the consumers.
public const string DayToDay = @"^[.]";
public const string DayRegexForPeriod = @"^[.]";
public const string PastRegex = @"(?<past>(之前|前|上|近|过去))";
public const string FutureRegex = @"(?<future>(之后|之後|后|後|(?<![一两几]\s*)下|未来(的)?))";
public const string SeasonRegex = @"(?<season>春|夏|秋|冬)(天|季)?";
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ public static class NumbersDefinitions
public const string PointRegexStr = @"[\..・]";
public static readonly string AllFloatRegex = $@"{NegativeNumberTermsRegex}?{AllIntRegex}\s*{PointRegexStr}\s*[一二三四五六七八九](\s*{ZeroToNineIntegerRegex})*";
public static readonly string NumbersWithAllowListRegex = $@"(?<!(離は))({NegativeNumberTermsRegex}?({NotSingleRegex}|{SingleRegex})(?!({AllIntRegex}*([、.]{ZeroToNineIntegerRegex}+)*|{AllFloatRegex})*\s*{PercentageRegex}+))(?!(\s*{AllMultiplierLookupRegex}))";
public static readonly string NumbersAggressiveRegex = $@"(({AllIntRegex})(?!({AllIntRegex}*([、.]{ZeroToNineIntegerRegex}+)*|{AllFloatRegex})*(\s*{PercentageRegex})?))";
public static readonly string NumbersAggressiveRegex = $@"(({AllIntRegex})(?!({AllIntRegex}|([、.]{ZeroToNineIntegerRegex})|{AllFloatRegex}|\s*{PercentageRegex})))";
public static readonly string PointRegex = $@"{PointRegexStr}";
public static readonly string DoubleSpecialsChars = $@"((?<!({ZeroToNineFullHalfRegex}+[\..]{ZeroToNineFullHalfRegex}*))({NegativeNumberTermsRegexNum}\s*)?{ZeroToNineFullHalfRegex}+[\..,]{ZeroToNineFullHalfRegex}+(?!({ZeroToNineFullHalfRegex}*[\..,]{ZeroToNineFullHalfRegex}+)))(?=\b|\D)(?!\s*{AllMultiplierLookupRegex})";
public static readonly string DoubleRoundNumberSpecialsChars = $@"(?<!(({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})*))(({NegativeNumberTermsRegexNum}|{NegativeNumberTermsRegex})\s*)?({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+(?!({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})*[\..・,]({ZeroToNineIntegerRegex}|{RoundNumberIntegerRegex})+)";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public static class DateTimeDefinitions
public static readonly string DateThisRegex = $@"(이번\s?주?)\s*{WeekDayRegex}";
public static readonly string DateLastRegex = $@"((저번|지난)\s?주?)\s*{WeekDayRegex}";
public static readonly string DateNextRegex = $@"(다음\s?주?)\s*{WeekDayRegex}";
// NOTE(review): `^[.]` can only match a literal '.' located directly at the `^` anchor.
// It looks like a deliberate "matches nothing useful" placeholder for patterns this
// language does not define — confirm against the extractor that consumes these constants.
public const string SpecialMonthRegex = @"^[.]";
public const string SpecialYearRegex = @"^[.]";
// Alternation of literal relative-day words; 모레 may optionally be preceded by 내일 and a single whitespace character.
public const string SpecialDayRegex = @"(최근|그저께|그제|((내일)?\s?모레)|그끄저께|어제|내일|오늘|금일|작일|익일|당일|명일|전일)";
public const string SpecialDayWithNumRegex = @"^[.]";
public static readonly string WeekDayOfMonthRegex = $@"((({MonthRegex}|{MonthNumRegex}(월|달))의?\s*)?(?<cardinal>첫\s?번?째|두\s?번째|둘째|세\s?번째|셋째|네\s?번째|넷째|다섯\s?번?째|다섯째|여섯\s?번?째|여섯째|마지막)\s*{WeekDayRegex})";
Expand Down Expand Up @@ -75,6 +77,7 @@ public static class DateTimeDefinitions
public static readonly string MonthSuffixRegex = $@"(?<msuf>({RelativeMonthRegex}|{MonthRegex}))";
public static readonly string SimpleCasesRegex = $@"((从)\s*)?(({YearRegex}|{DatePeriodYearInCJKRegex})\s*)?{MonthSuffixRegex}({DatePeriodDayRegexInCJK}|{DayRegex})\s*{DatePeriodTillRegex}\s*({DatePeriodDayRegexInCJK}|{DayRegex})((\s+|\s*,\s*){YearRegex})?";
public static readonly string YearAndMonth = $@"({DatePeriodYearInCJKRegex}|{YearRegex})\s*{MonthRegex}";
// {YearNumRegex}, then exactly one separator from the class [/\\\-] ('/', '\' or '-'), then {MonthNumRegex}.
// The trailing `\b$` anchors the match to the end of the input, so the pair is only recognised as the last token.
public static readonly string SimpleYearAndMonth = $@"({YearNumRegex}[/\\\-]{MonthNumRegex}\b$)";
public static readonly string PureNumYearAndMonth = $@"({YearRegexInNumber}\s*[-\.\/]\s*{MonthNumRegex})|({MonthNumRegex}\s*\/\s*{YearRegexInNumber})";
public static readonly string OneWordPeriodRegex = $@"(((?<yearrel>(明|今|去)年)\s*)?{MonthRegex}|({DatePeriodThisRegex}|{DatePeriodLastRegex}|{DatePeriodNextRegex})(?<halfTag>半)?\s*(周末|周|月|年)|周末|(今|明|去|前|后)年(\s*{HalfYearRegex})?)";
public static readonly string WeekOfMonthRegex = $@"(?<wom>{MonthSuffixRegex}的(?<cardinal>첫\s?번?째|두번째|둘째|세번째|셋째|네번째|넷째|마지막)\s*주\s*)";
Expand All @@ -86,6 +89,8 @@ public static class DateTimeDefinitions
public static readonly string YearToYearSuffixRequired = $@"({DateRangePrepositions})({DatePeriodYearInCJKRegex}|{YearRegex})\s*({DatePeriodTillSuffixRequiredRegex})\s*({DatePeriodYearInCJKRegex}|{YearRegex})\s*(之间|之内|期间|中间|间)";
public static readonly string MonthToMonth = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillRegex}({MonthRegex})";
public static readonly string MonthToMonthSuffixRequired = $@"({DateRangePrepositions})({MonthRegex}){DatePeriodTillSuffixRequiredRegex}({MonthRegex})\s*(之间|之内|期间|中间|间)";
// NOTE(review): `^[.]` can only match a literal '.' located directly at the `^` anchor.
// It looks like a placeholder for patterns that are not defined for this language — confirm with the consumers.
public const string DayToDay = @"^[.]";
public const string DayRegexForPeriod = @"^[.]";
public const string PastRegex = @"(?<past>(之前|前|上|近|过去))";
public const string FutureRegex = @"(?<future>(之后|之後|后|後|(?<![一两几]\s*)下|未来(的)?))";
public const string SeasonRegex = @"(?<season>春|夏|秋|冬)(天|季)?";
Expand Down
46 changes: 23 additions & 23 deletions .NET/Microsoft.Recognizers.Text.DataDrivenTests/TestHelpers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using Microsoft.Recognizers.Text.DateTime.German;
using Microsoft.Recognizers.Text.DateTime.Hindi;
using Microsoft.Recognizers.Text.DateTime.Italian;
using Microsoft.Recognizers.Text.DateTime.Japanese;
using Microsoft.Recognizers.Text.DateTime.Korean;
using Microsoft.Recognizers.Text.DateTime.Portuguese;
using Microsoft.Recognizers.Text.DateTime.Spanish;
Expand Down Expand Up @@ -569,59 +570,58 @@ public static IDateTimeExtractor GetJapaneseExtractor(DateTimeExtractors extract
switch (extractorName)
{
case DateTimeExtractors.Date:
return new DateTime.Japanese.JapaneseDateExtractorConfiguration();
return new BaseCJKDateExtractor(new JapaneseDateExtractorConfiguration(defaultConfig));
case DateTimeExtractors.Time:
return new DateTime.Japanese.JapaneseTimeExtractorConfiguration();
return new BaseCJKTimeExtractor(new JapaneseTimeExtractorConfiguration(defaultConfig));
case DateTimeExtractors.DatePeriod:
return new DateTime.Japanese.JapaneseDatePeriodExtractorConfiguration();
return new BaseCJKDatePeriodExtractor(new JapaneseDatePeriodExtractorConfiguration(defaultConfig));
case DateTimeExtractors.TimePeriod:
return new DateTime.Japanese.JapaneseTimePeriodExtractorConfiguration();
return new BaseCJKTimePeriodExtractor(new JapaneseTimePeriodExtractorConfiguration(defaultConfig));
case DateTimeExtractors.DateTime:
return new DateTime.Japanese.JapaneseDateTimeExtractorConfiguration();
return new BaseCJKDateTimeExtractor(new JapaneseDateTimeExtractorConfiguration(defaultConfig));
case DateTimeExtractors.DateTimePeriod:
return new DateTime.Japanese.JapaneseDateTimePeriodExtractorConfiguration();
return new BaseCJKDateTimePeriodExtractor(new JapaneseDateTimePeriodExtractorConfiguration(defaultConfig));
case DateTimeExtractors.Duration:
return new DateTime.Japanese.JapaneseDurationExtractorConfiguration();
return new BaseCJKDurationExtractor(new JapaneseDurationExtractorConfiguration(defaultConfig));
case DateTimeExtractors.Holiday:
return new BaseHolidayExtractor(new DateTime.Japanese.JapaneseHolidayExtractorConfiguration(defaultConfig));
return new BaseCJKHolidayExtractor(new JapaneseHolidayExtractorConfiguration(defaultConfig));
case DateTimeExtractors.Set:
return new DateTime.Japanese.JapaneseSetExtractorConfiguration();
return new BaseCJKSetExtractor(new JapaneseSetExtractorConfiguration(defaultConfig));
case DateTimeExtractors.Merged:
return new DateTime.Japanese.JapaneseMergedExtractorConfiguration(defaultConfig);
return new BaseCJKMergedDateTimeExtractor(new JapaneseMergedExtractorConfiguration(defaultConfig));
case DateTimeExtractors.MergedSkipFromTo:
return new DateTime.Japanese.JapaneseMergedExtractorConfiguration(skipConfig);
return new BaseCJKMergedDateTimeExtractor(new JapaneseMergedExtractorConfiguration(skipConfig));
}

throw new Exception($"Extractor '{extractorName}' for Japanese not supported");
}

public static IDateTimeParser GetJapaneseParser(DateTimeParsers parserName)
{

var config = new BaseDateTimeOptionsConfiguration(Culture.Japanese, DateTimeOptions.None);
var config = new JapaneseCommonDateTimeParserConfiguration(new BaseDateTimeOptionsConfiguration(Culture.Japanese, DateTimeOptions.None));

switch (parserName)
{
case DateTimeParsers.Date:
return new DateTime.Japanese.JapaneseDateParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKDateParser(new JapaneseDateParserConfiguration(config));
case DateTimeParsers.Time:
return new DateTime.Japanese.JapaneseTimeParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKTimeParser(new JapaneseTimeParserConfiguration(config));
case DateTimeParsers.DatePeriod:
return new DateTime.Japanese.JapaneseDatePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKDatePeriodParser(new JapaneseDatePeriodParserConfiguration(config));
case DateTimeParsers.TimePeriod:
return new DateTime.Japanese.JapaneseTimePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKTimePeriodParser(new JapaneseTimePeriodParserConfiguration(config));
case DateTimeParsers.DateTime:
return new DateTime.Japanese.JapaneseDateTimeParser(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKDateTimeParser(new JapaneseDateTimeParserConfiguration(config));
case DateTimeParsers.DateTimePeriod:
return new DateTime.Japanese.JapaneseDateTimePeriodParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKDateTimePeriodParser(new JapaneseDateTimePeriodParserConfiguration(config));
case DateTimeParsers.Duration:
return new DateTime.Japanese.JapaneseDurationParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKDurationParser(new JapaneseDurationParserConfiguration(config));
case DateTimeParsers.Holiday:
return new DateTime.Japanese.JapaneseHolidayParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKHolidayParser(new JapaneseHolidayParserConfiguration(config));
case DateTimeParsers.Set:
return new DateTime.Japanese.JapaneseSetParserConfiguration(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKSetParser(new JapaneseSetParserConfiguration(config));
case DateTimeParsers.Merged:
return new FullDateTimeParser(new DateTime.Japanese.JapaneseDateTimeParserConfiguration(config));
return new BaseCJKMergedDateTimeParser(new JapaneseMergedParserConfiguration(config));
}

throw new Exception($"Parser '{parserName}' for Japanese not supported");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
// 2017.12, 2017-12, 2017/12, 12/2017
public static readonly Regex PureNumYearAndMonth = new Regex(DateTimeDefinitions.PureNumYearAndMonth, RegexFlags);

// Compiled from DateTimeDefinitions.SimpleYearAndMonth (year and month numbers joined by '/', '\' or '-').
// NOTE(review): presumably covers inputs such as 2017/12 or 2017-12 at the end of the text — confirm against YearNumRegex.
public static readonly Regex SimpleYearAndMonth = new Regex(DateTimeDefinitions.SimpleYearAndMonth, RegexFlags);

public static readonly Regex OneWordPeriodRegex = new Regex(DateTimeDefinitions.OneWordPeriodRegex, RegexFlags);

public static readonly Regex WeekOfMonthRegex = new Regex(DateTimeDefinitions.WeekOfMonthRegex, RegexFlags);
Expand All @@ -41,6 +43,10 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig

public static readonly Regex MonthToMonthSuffixRequired = new Regex(DateTimeDefinitions.MonthToMonthSuffixRequired, RegexFlags);

// Compiled from DateTimeDefinitions.DayToDay.
// NOTE(review): if that definition is the `^[.]` placeholder, this regex will not match ordinary text — confirm.
public static readonly Regex DayToDay = new Regex(DateTimeDefinitions.DayToDay, RegexFlags);

// Compiled from DateTimeDefinitions.DayRegexForPeriod (same placeholder caveat applies — confirm).
public static readonly Regex DayRegexForPeriod = new Regex(DateTimeDefinitions.DayRegexForPeriod, RegexFlags);

public static readonly Regex PastRegex = new Regex(DateTimeDefinitions.PastRegex, RegexFlags);

public static readonly Regex FutureRegex = new Regex(DateTimeDefinitions.FutureRegex, RegexFlags);
Expand All @@ -51,6 +57,10 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig

public static readonly Regex DecadeRegex = new Regex(DateTimeDefinitions.DecadeRegex, RegexFlags);

// Compiled from DateTimeDefinitions.SpecialMonthRegex.
// NOTE(review): if that definition is the `^[.]` placeholder, this regex will not match ordinary text — confirm.
public static readonly Regex SpecialMonthRegex = new Regex(DateTimeDefinitions.SpecialMonthRegex, RegexFlags);

// Compiled from DateTimeDefinitions.SpecialYearRegex (same placeholder caveat applies — confirm).
public static readonly Regex SpecialYearRegex = new Regex(DateTimeDefinitions.SpecialYearRegex, RegexFlags);

public static readonly Regex DayRegex = new Regex(DateTimeDefinitions.DayRegex, RegexFlags);
public static readonly Regex DayRegexInCJK = new Regex(DateTimeDefinitions.DatePeriodDayRegexInCJK, RegexFlags);
public static readonly Regex MonthNumRegex = new Regex(DateTimeDefinitions.MonthNumRegex, RegexFlags);
Expand Down Expand Up @@ -80,6 +90,8 @@ public class ChineseDatePeriodExtractorConfiguration : BaseDateTimeOptionsConfig
YearAndMonth,
PureNumYearAndMonth,
YearInCJKRegex,
SpecialMonthRegex,
SpecialYearRegex,
WeekOfMonthRegex,
SeasonWithYear,
QuarterRegex,
Expand Down
Loading

0 comments on commit 3cf716e

Please sign in to comment.